diff --git a/.bazelignore b/.bazelignore index 61b5e9458df6e..01fcdd0d8e050 100644 --- a/.bazelignore +++ b/.bazelignore @@ -1,3 +1,4 @@ # We do not use this library in our Bazel build. It contains an # infinitely recursing symlink that makes Bazel very unhappy. third_party/ittapi/ +third_party/opentelemetry-cpp diff --git a/.ci/docker/build.sh b/.ci/docker/build.sh index a00502dd81d24..426f4698c2b00 100755 --- a/.ci/docker/build.sh +++ b/.ci/docker/build.sh @@ -204,7 +204,7 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - ROCM_VERSION=5.6 + ROCM_VERSION=6.0 NINJA_VERSION=1.9.0 CONDA_CMAKE=yes TRITON=yes @@ -215,7 +215,7 @@ case "$image" in PROTOBUF=yes DB=yes VISION=yes - ROCM_VERSION=5.7 + ROCM_VERSION=6.1 NINJA_VERSION=1.9.0 CONDA_CMAKE=yes TRITON=yes @@ -229,6 +229,7 @@ case "$image" in BASEKIT_VERSION=2024.0.0-49522 NINJA_VERSION=1.9.0 CONDA_CMAKE=yes + TRITON=yes ;; pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks) ANACONDA_PYTHON_VERSION=3.8 @@ -277,6 +278,7 @@ case "$image" in CONDA_CMAKE=yes TRITON=yes DOCS=yes + UNINSTALL_DILL=yes ;; pytorch-linux-jammy-py3-clang12-executorch) ANACONDA_PYTHON_VERSION=3.10 @@ -296,6 +298,21 @@ case "$image" in CUDA_VERSION=11.8 CONDA_CMAKE=yes ;; + pytorch-linux-jammy-aarch64-py3.10-gcc11) + ANACONDA_PYTHON_VERSION=3.10 + GCC_VERSION=11 + ACL=yes + PROTOBUF=yes + DB=yes + VISION=yes + CONDA_CMAKE=yes + # snadampal: skipping sccache due to the following issue + # https://github.com/pytorch/pytorch/issues/121559 + SKIP_SCCACHE_INSTALL=yes + # snadampal: skipping llvm src build install because the current version + # from pytorch/llvm:9.0.1 is x86 specific + SKIP_LLVM_SRC_BUILD_INSTALL=yes + ;; *) # Catch-all for builds that are not hardcoded. PROTOBUF=yes @@ -387,6 +404,9 @@ docker build \ --build-arg "INDUCTOR_BENCHMARKS=${INDUCTOR_BENCHMARKS}" \ --build-arg "EXECUTORCH=${EXECUTORCH}" \ --build-arg "BASEKIT_VERSION=${BASEKIT_VERSION}" \ + --build-arg "ACL=${ACL:-}" \ + --build-arg "SKIP_SCCACHE_INSTALL=${SKIP_SCCACHE_INSTALL:-}" \ + --build-arg "SKIP_LLVM_SRC_BUILD_INSTALL=${SKIP_LLVM_SRC_BUILD_INSTALL:-}" \ -f $(dirname ${DOCKERFILE})/Dockerfile \ -t "$tmp_tag" \ "$@" \ diff --git a/.ci/docker/ci_commit_pins/executorch.txt b/.ci/docker/ci_commit_pins/executorch.txt index 258c01ef0ca02..adf618bf2fedd 100644 --- a/.ci/docker/ci_commit_pins/executorch.txt +++ b/.ci/docker/ci_commit_pins/executorch.txt @@ -1 +1 @@ -663882fe7dc518c04adf3d2ee5ccb7d99f41ade4 +d4b3e5cc607e97afdba79dc90f8ef968142f347c diff --git a/.ci/docker/ci_commit_pins/huggingface.txt b/.ci/docker/ci_commit_pins/huggingface.txt index a5f4dc315ee17..f00d6ca4f9ca7 100644 --- a/.ci/docker/ci_commit_pins/huggingface.txt +++ b/.ci/docker/ci_commit_pins/huggingface.txt @@ -1 +1 @@ -6c26faa159b79a42d7fa46cb66e2d21523351987 +243e186efbf7fb93328dd6b34927a4e8c8f24395 diff --git a/.ci/docker/ci_commit_pins/triton-rocm.txt b/.ci/docker/ci_commit_pins/triton-rocm.txt index 4a873428eaa69..2df035af1fdd7 100644 --- a/.ci/docker/ci_commit_pins/triton-rocm.txt +++ b/.ci/docker/ci_commit_pins/triton-rocm.txt @@ -1 +1 @@ -dafe1459823b9549417ed95e9720f1b594fab329 +bbe6246e37d8aa791c67daaf9d9d61b26c9ccfdc diff --git a/.ci/docker/ci_commit_pins/triton-xpu.txt b/.ci/docker/ci_commit_pins/triton-xpu.txt new file mode 100644 index 0000000000000..36ca144cb6ed5 --- /dev/null +++ b/.ci/docker/ci_commit_pins/triton-xpu.txt @@ -0,0 +1 @@ +b8c64f64c18d8cac598b3adb355c21e7439c21de diff --git a/.ci/docker/ci_commit_pins/triton.txt b/.ci/docker/ci_commit_pins/triton.txt index dc4dffc8b700c..26516efc0b525 100644 
--- a/.ci/docker/ci_commit_pins/triton.txt +++ b/.ci/docker/ci_commit_pins/triton.txt @@ -1 +1 @@ -e28a256d71f3cf2bcc7b69d6bda73a9b855e385e +45fff310c891f5a92d55445adf8cc9d29df5841e diff --git a/.ci/docker/common/install_acl.sh b/.ci/docker/common/install_acl.sh new file mode 100644 index 0000000000000..f5e5ce92af4af --- /dev/null +++ b/.ci/docker/common/install_acl.sh @@ -0,0 +1,16 @@ +set -euo pipefail + +readonly version=v23.08 +readonly src_host=https://review.mlplatform.org/ml +readonly src_repo=ComputeLibrary + +# Clone ACL +[[ ! -d ${src_repo} ]] && git clone ${src_host}/${src_repo}.git +cd ${src_repo} + +git checkout $version + +# Build with scons +scons -j8 Werror=0 debug=0 neon=1 opencl=0 embed_kernels=0 \ + os=linux arch=armv8a build=native multi_isa=1 \ + fixed_format_kernels=1 openmp=1 cppthreads=0 diff --git a/.ci/docker/common/install_base.sh b/.ci/docker/common/install_base.sh index e3568b200060b..ebaa17878ade4 100755 --- a/.ci/docker/common/install_base.sh +++ b/.ci/docker/common/install_base.sh @@ -113,7 +113,6 @@ install_centos() { glibc-devel \ glibc-headers \ glog-devel \ - hiredis-devel \ libstdc++-devel \ libsndfile-devel \ make \ @@ -153,7 +152,7 @@ wget https://ossci-linux.s3.amazonaws.com/valgrind-${VALGRIND_VERSION}.tar.bz2 tar -xjf valgrind-${VALGRIND_VERSION}.tar.bz2 cd valgrind-${VALGRIND_VERSION} ./configure --prefix=/usr/local -make -j6 +make -j$[$(nproc) - 2] sudo make install cd ../../ rm -rf valgrind_build diff --git a/.ci/docker/common/install_conda.sh b/.ci/docker/common/install_conda.sh index 2cbb49c6af312..3a4b48c4d7a33 100755 --- a/.ci/docker/common/install_conda.sh +++ b/.ci/docker/common/install_conda.sh @@ -9,10 +9,19 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then MAJOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 1) MINOR_PYTHON_VERSION=$(echo "$ANACONDA_PYTHON_VERSION" | cut -d . -f 2) +if [[ $(uname -m) == "aarch64" ]]; then + BASE_URL="https://github.com/conda-forge/miniforge/releases/latest/download" case "$MAJOR_PYTHON_VERSION" in - 2) - CONDA_FILE="Miniconda2-latest-Linux-x86_64.sh" + 3) + CONDA_FILE="Miniforge3-Linux-aarch64.sh" ;; + *) + echo "Unsupported ANACONDA_PYTHON_VERSION: $ANACONDA_PYTHON_VERSION" + exit 1 + ;; + esac +else + case "$MAJOR_PYTHON_VERSION" in 3) CONDA_FILE="Miniconda3-latest-Linux-x86_64.sh" ;; @@ -21,6 +30,7 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then exit 1 ;; esac +fi mkdir -p /opt/conda chown jenkins:jenkins /opt/conda @@ -47,15 +57,39 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then # Uncomment the below when resolved to track the latest conda update # as_jenkins conda update -y -n base conda + if [[ $(uname -m) == "aarch64" ]]; then + export SYSROOT_DEP="sysroot_linux-aarch64=2.17" + else + export SYSROOT_DEP="sysroot_linux-64=2.17" + fi + # Install correct Python version - as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y python="$ANACONDA_PYTHON_VERSION" + # Also ensure sysroot is using a modern GLIBC to match system compilers + as_jenkins conda create -n py_$ANACONDA_PYTHON_VERSION -y\ + python="$ANACONDA_PYTHON_VERSION" \ + ${SYSROOT_DEP} + + # libstdcxx from conda default channels are too old, we need GLIBCXX_3.4.30 + # which is provided in libstdcxx 12 and up. 
+ conda_install libstdcxx-ng=12.3.0 -c conda-forge # Install PyTorch conda deps, as per https://github.com/pytorch/pytorch README - CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools" - if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ]; then - conda_install numpy=1.23.5 ${CONDA_COMMON_DEPS} + if [[ $(uname -m) == "aarch64" ]]; then + CONDA_COMMON_DEPS="astunparse pyyaml setuptools openblas==0.3.25=*openmp* ninja==1.11.1 scons==4.5.2" + + if [ "$ANACONDA_PYTHON_VERSION" = "3.8" ]; then + conda_install numpy=1.24.4 ${CONDA_COMMON_DEPS} + else + conda_install numpy=1.26.2 ${CONDA_COMMON_DEPS} + fi else - conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS} + CONDA_COMMON_DEPS="astunparse pyyaml mkl=2021.4.0 mkl-include=2021.4.0 setuptools" + + if [ "$ANACONDA_PYTHON_VERSION" = "3.11" ] || [ "$ANACONDA_PYTHON_VERSION" = "3.12" ]; then + conda_install numpy=1.26.0 ${CONDA_COMMON_DEPS} + else + conda_install numpy=1.21.2 ${CONDA_COMMON_DEPS} + fi fi # Install llvm-8 as it is required to compile llvmlite-0.30.0 from source @@ -89,14 +123,5 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then pip_install -r /opt/conda/requirements-docs.txt fi - # HACK HACK HACK - # gcc-9 for ubuntu-18.04 from http://ppa.launchpad.net/ubuntu-toolchain-r/test/ubuntu - # Pulls llibstdc++6 13.1.0-8ubuntu1~18.04 which is too new for conda - # So remove libstdc++6.so.3.29 installed by https://anaconda.org/anaconda/libstdcxx-ng/files?version=11.2.0 - # Same is true for gcc-12 from Ubuntu-22.04 - if grep -e [12][82].04.[623] /etc/issue >/dev/null; then - rm /opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/lib/libstdc++.so.6 - fi - popd fi diff --git a/.ci/docker/common/install_db.sh b/.ci/docker/common/install_db.sh index 7e18947acbd3b..7e7234063b917 100755 --- a/.ci/docker/common/install_db.sh +++ b/.ci/docker/common/install_db.sh @@ -4,11 +4,6 @@ set -ex install_ubuntu() { apt-get update - apt-get install -y --no-install-recommends \ - libhiredis-dev \ - libleveldb-dev \ - liblmdb-dev \ - libsnappy-dev # Cleanup apt-get autoclean && apt-get clean @@ -20,12 +15,6 @@ install_centos() { # See http://fedoraproject.org/wiki/EPEL yum --enablerepo=extras install -y epel-release - yum install -y \ - hiredis-devel \ - leveldb-devel \ - lmdb-devel \ - snappy-devel - # Cleanup yum clean all rm -rf /var/cache/yum diff --git a/.ci/docker/common/install_executorch.sh b/.ci/docker/common/install_executorch.sh index e6588098e8a49..a3296dc0df3ed 100755 --- a/.ci/docker/common/install_executorch.sh +++ b/.ci/docker/common/install_executorch.sh @@ -48,7 +48,6 @@ setup_executorch() { install_flatc_from_source pip_install . - build_executorch_runner "cmake" # Make sure that all the newly generate files are owned by Jenkins chown -R jenkins . 
diff --git a/.ci/docker/common/install_onnx.sh b/.ci/docker/common/install_onnx.sh index de283b18c6fe1..a1a5fde7d2f5b 100755 --- a/.ci/docker/common/install_onnx.sh +++ b/.ci/docker/common/install_onnx.sh @@ -26,18 +26,19 @@ pip_install \ pytest-cov==4.0.0 \ pytest-subtests==0.10.0 \ tabulate==0.9.0 \ - transformers==4.32.1 + transformers==4.36.2 pip_install coloredlogs packaging -retry pip_install -i https://aiinfra.pkgs.visualstudio.com/PublicPackages/_packaging/ORT-Nightly/pypi/simple/ --no-cache-dir --no-input ort-nightly==1.17.0.dev20231005006 -pip_install -i https://test.pypi.org/simple/ onnx==1.15.0rc2 -pip_install onnxscript==0.1.0.dev20231128 --no-deps +pip_install onnxruntime==1.17.0 +pip_install onnx==1.15.0 +# pip_install "onnxscript@git+https://github.com/microsoft/onnxscript@3e869ef8ccf19b5ebd21c10d3e9c267c9a9fa729" --no-deps +pip_install onnxscript==0.1.0.dev20240315 --no-deps # Cache the transformers model to be used later by ONNX tests. We need to run the transformers # package to download the model. By default, the model is cached at ~/.cache/huggingface/hub/ IMPORT_SCRIPT_FILENAME="/tmp/onnx_import_script.py" -as_jenkins echo 'import transformers; transformers.AutoModel.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2");' > "${IMPORT_SCRIPT_FILENAME}" +as_jenkins echo 'import transformers; transformers.AutoModel.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2"); transformers.AutoModelForSpeechSeq2Seq.from_pretrained("openai/whisper-large-v3");' > "${IMPORT_SCRIPT_FILENAME}" # Need a PyTorch version for transformers to work pip_install --pre torch --index-url https://download.pytorch.org/whl/nightly/cpu diff --git a/.ci/docker/common/install_openssl.sh b/.ci/docker/common/install_openssl.sh index 2f645f0bcb5e1..c73c9c333c002 100644 --- a/.ci/docker/common/install_openssl.sh +++ b/.ci/docker/common/install_openssl.sh @@ -9,7 +9,8 @@ tar xf "${OPENSSL}.tar.gz" cd "${OPENSSL}" ./config --prefix=/opt/openssl -d '-Wl,--enable-new-dtags,-rpath,$(LIBRPATH)' # NOTE: openssl install errors out when built with the -j option -make -j6; make install_sw +NPROC=$[$(nproc) - 2] +make -j${NPROC}; make install_sw # Link the ssl libraries to the /usr/lib folder. sudo ln -s /opt/openssl/lib/lib* /usr/lib cd .. diff --git a/.ci/docker/common/install_protobuf.sh b/.ci/docker/common/install_protobuf.sh index 4b7a7a6ac23f7..7c966bcae91d3 100755 --- a/.ci/docker/common/install_protobuf.sh +++ b/.ci/docker/common/install_protobuf.sh @@ -2,55 +2,18 @@ set -ex -# This function installs protobuf 3.17 -install_protobuf_317() { - pb_dir="/usr/temp_pb_install_dir" - mkdir -p $pb_dir +pb_dir="/usr/temp_pb_install_dir" +mkdir -p $pb_dir - # On the nvidia/cuda:9-cudnn7-devel-centos7 image we need this symlink or - # else it will fail with - # g++: error: ./../lib64/crti.o: No such file or directory - ln -s /usr/lib64 "$pb_dir/lib64" +# On the nvidia/cuda:9-cudnn7-devel-centos7 image we need this symlink or +# else it will fail with +# g++: error: ./../lib64/crti.o: No such file or directory +ln -s /usr/lib64 "$pb_dir/lib64" - curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.17.3/protobuf-all-3.17.3.tar.gz" --retry 3 - tar -xvz -C "$pb_dir" --strip-components 1 -f protobuf-all-3.17.3.tar.gz - # -j6 to balance memory usage and speed. - # naked `-j` seems to use too much memory. 
- pushd "$pb_dir" && ./configure && make -j6 && make -j6 check && sudo make -j6 install && sudo ldconfig - popd - rm -rf $pb_dir -} +curl -LO "https://github.com/protocolbuffers/protobuf/releases/download/v3.17.3/protobuf-all-3.17.3.tar.gz" --retry 3 -install_ubuntu() { - # Ubuntu 14.04 has cmake 2.8.12 as the default option, so we will - # install cmake3 here and use cmake3. - apt-get update - if [[ "$UBUNTU_VERSION" == 14.04 ]]; then - apt-get install -y --no-install-recommends cmake3 - fi - - # Cleanup - apt-get autoclean && apt-get clean - rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* - - install_protobuf_317 -} - -install_centos() { - install_protobuf_317 -} - -# Install base packages depending on the base OS -ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') -case "$ID" in - ubuntu) - install_ubuntu - ;; - centos) - install_centos - ;; - *) - echo "Unable to determine OS..." - exit 1 - ;; -esac +tar -xvz --no-same-owner -C "$pb_dir" --strip-components 1 -f protobuf-all-3.17.3.tar.gz +NPROC=$[$(nproc) - 2] +pushd "$pb_dir" && ./configure && make -j${NPROC} && make -j${NPROC} check && sudo make -j${NRPOC} install && sudo ldconfig +popd +rm -rf $pb_dir diff --git a/.ci/docker/common/install_rocm.sh b/.ci/docker/common/install_rocm.sh index caae5c112b581..085304ac7c978 100644 --- a/.ci/docker/common/install_rocm.sh +++ b/.ci/docker/common/install_rocm.sh @@ -61,6 +61,10 @@ install_ubuntu() { rocprofiler-dev \ roctracer-dev + if [[ $(ver $ROCM_VERSION) -ge $(ver 6.1) ]]; then + DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated rocm-llvm-dev + fi + # precompiled miopen kernels added in ROCm 3.5, renamed in ROCm 5.5 # search for all unversioned packages # if search fails it will abort this script; use true to avoid case where search fails @@ -80,6 +84,14 @@ install_ubuntu() { fi fi + # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime + if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then + for kdb in /opt/rocm/share/miopen/db/*.kdb + do + sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;" + done + fi + # Cleanup apt-get autoclean && apt-get clean rm -rf /var/lib/apt/lists/* /tmp/* /var/tmp/* @@ -151,6 +163,14 @@ install_centos() { fi fi + # ROCm 6.0 had a regression where journal_mode was enabled on the kdb files resulting in permission errors at runtime + if [[ $(ver $ROCM_VERSION) -ge $(ver 6.0) ]]; then + for kdb in /opt/rocm/share/miopen/db/*.kdb + do + sqlite3 $kdb "PRAGMA journal_mode=off; PRAGMA VACUUM;" + done + fi + # Cleanup yum clean all rm -rf /var/cache/yum diff --git a/.ci/docker/common/install_rocm_magma.sh b/.ci/docker/common/install_rocm_magma.sh index 457e0ad77361a..94b94661c4606 100644 --- a/.ci/docker/common/install_rocm_magma.sh +++ b/.ci/docker/common/install_rocm_magma.sh @@ -7,7 +7,7 @@ git clone https://bitbucket.org/icl/magma.git pushd magma # Version 2.7.2 + ROCm related updates -git checkout 823531632140d0edcb7e77c3edc0e837421471c5 +git checkout a1625ff4d9bc362906bd01f805dbbe12612953f6 cp make.inc-examples/make.inc.hip-gcc-mkl make.inc echo 'LIBDIR += -L$(MKLROOT)/lib' >> make.inc diff --git a/.ci/docker/common/install_triton.sh b/.ci/docker/common/install_triton.sh index ebde6c3f44761..de009c1a3adbf 100755 --- a/.ci/docker/common/install_triton.sh +++ b/.ci/docker/common/install_triton.sh @@ -13,8 +13,11 @@ conda_reinstall() { } if [ -n "${ROCM_VERSION}" ]; then - TRITON_REPO="https://github.com/ROCmSoftwarePlatform/triton" + 
TRITON_REPO="https://github.com/openai/triton" TRITON_TEXT_FILE="triton-rocm" +elif [ -n "${BASEKIT_VERSION}" ]; then + TRITON_REPO="https://github.com/intel/intel-xpu-backend-for-triton" + TRITON_TEXT_FILE="triton-xpu" else TRITON_REPO="https://github.com/openai/triton" TRITON_TEXT_FILE="triton" @@ -64,5 +67,6 @@ if [ -n "${CONDA_CMAKE}" ]; then # latest numpy version, which fails ASAN tests with the following import error: Numba # needs NumPy 1.20 or less. conda_reinstall cmake="${CMAKE_VERSION}" - conda_reinstall numpy="${NUMPY_VERSION}" + # Note that we install numpy with pip as conda might not have the version we want + pip_install --force-reinstall numpy=="${NUMPY_VERSION}" fi diff --git a/.ci/docker/common/install_ucc.sh b/.ci/docker/common/install_ucc.sh index 333e44e6f779f..2224811bd987b 100755 --- a/.ci/docker/common/install_ucc.sh +++ b/.ci/docker/common/install_ucc.sh @@ -36,7 +36,12 @@ function install_ucc() { git submodule update --init --recursive ./autogen.sh - ./configure --prefix=$UCC_HOME --with-ucx=$UCX_HOME --with-cuda=$with_cuda + # We only run distributed tests on Tesla M60 and A10G + NVCC_GENCODE="-gencode=arch=compute_52,code=sm_52 -gencode=arch=compute_86,code=compute_86" + ./configure --prefix=$UCC_HOME \ + --with-ucx=$UCX_HOME \ + --with-cuda=$with_cuda \ + --with-nvcc-gencode="${NVCC_GENCODE}" time make -j sudo make install diff --git a/.ci/docker/common/install_xpu.sh b/.ci/docker/common/install_xpu.sh index 813a7c4e278eb..d98ad2049b47c 100644 --- a/.ci/docker/common/install_xpu.sh +++ b/.ci/docker/common/install_xpu.sh @@ -3,7 +3,7 @@ set -xe # IntelĀ® software for general purpose GPU capabilities. -# Refer to https://dgpu-docs.intel.com/releases/stable_647_21_20230714.html +# Refer to https://dgpu-docs.intel.com/releases/LTS_803.29_20240131.html # IntelĀ® oneAPI Base Toolkit (version 2024.0.0) has been updated to include functional and security updates. 
# Refer to https://www.intel.com/content/www/us/en/developer/tools/oneapi/base-toolkit-download.html @@ -21,7 +21,7 @@ function install_ubuntu() { | gpg --dearmor | tee /usr/share/keyrings/oneapi-archive-keyring.gpg > /dev/null # Add the signed entry to APT sources and configure the APT client to use the Intel repository - echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/production/2328 unified" \ + echo "deb [arch=amd64 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/gpu/ubuntu jammy/lts/2350 unified" \ | tee /etc/apt/sources.list.d/intel-gpu-jammy.list echo "deb [signed-by=/usr/share/keyrings/oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main" \ | tee /etc/apt/sources.list.d/oneAPI.list diff --git a/.ci/docker/requirements-ci.txt b/.ci/docker/requirements-ci.txt index b12cc8c236e66..75852c6b81ce2 100644 --- a/.ci/docker/requirements-ci.txt +++ b/.ci/docker/requirements-ci.txt @@ -15,7 +15,7 @@ click #Pinned versions: #test that import: -coremltools==5.0b5 +coremltools==5.0b5 ; python_version < "3.12" #Description: Apple framework for ML integration #Pinned versions: 5.0b5 #test that import: @@ -25,6 +25,11 @@ coremltools==5.0b5 #Pinned versions: #test that import: +dill==0.3.7 +#Description: dill extends pickle with serializing and de-serializing for most built-ins +#Pinned versions: 0.3.7 +#test that import: dynamo/test_replay_record.py test_dataloader.py test_datapipe.py test_serialization.py + expecttest==0.1.6 #Description: method for writing tests where test framework auto populates # the expected output based on previous runs @@ -47,6 +52,11 @@ junitparser==2.1.1 #Pinned versions: 2.1.1 #test that import: +lark==0.12.0 +#Description: parser +#Pinned versions: 0.12.0 +#test that import: + librosa>=0.6.2 ; python_version < "3.11" #Description: A python package for music and audio analysis #Pinned versions: >=0.6.2 @@ -66,7 +76,7 @@ librosa>=0.6.2 ; python_version < "3.11" #Description: A testing library that allows you to replace parts of your #system under test with mock objects #Pinned versions: -#test that import: test_module_init.py, test_modules.py, test_nn.py, +#test that import: test_modules.py, test_nn.py, #test_testing.py #MonkeyType # breaks pytorch-xla-linux-bionic-py3.7-clang8 @@ -75,10 +85,10 @@ librosa>=0.6.2 ; python_version < "3.11" #Pinned versions: #test that import: -mypy==1.7.0 +mypy==1.9.0 # Pin MyPy version because new errors are likely to appear with each release #Description: linter -#Pinned versions: 1.7.0 +#Pinned versions: 1.9.0 #test that import: test_typing.py, test_type_hints.py networkx==2.8.8 @@ -124,9 +134,9 @@ opt-einsum==3.3 #Pinned versions: 3.3 #test that import: test_linalg.py -optree==0.9.1 +optree==0.11.0 #Description: A library for tree manipulation -#Pinned versions: 0.9.1 +#Pinned versions: 0.11.0 #test that import: test_vmap.py, test_aotdispatch.py, test_dynamic_shapes.py, #test_pytree.py, test_ops.py, test_control_flow.py, test_modules.py, #common_utils.py, test_eager_transforms.py, test_python_dispatch.py, @@ -137,9 +147,9 @@ optree==0.9.1 #test_pointwise_ops.py, test_dtensor_ops.py, test_torchinductor.py, test_fx.py, #test_fake_tensor.py, test_mps.py -pillow==10.0.1 +pillow==10.3.0 #Description: Python Imaging Library fork -#Pinned versions: 10.0.1 +#Pinned versions: 10.3.0 #test that import: protobuf==3.20.2 @@ -162,11 +172,6 @@ pytest-xdist==3.3.1 #Pinned versions: #test that import: -pytest-shard==0.1.2 
-#Description: plugin spliting up tests in pytest -#Pinned versions: -#test that import: - pytest-flakefinder==1.1.0 #Description: plugin for rerunning tests a fixed number of times in pytest #Pinned versions: 1.1.0 @@ -223,12 +228,11 @@ scikit-image==0.20.0 ; python_version >= "3.10" #Pinned versions: 0.20.3 #test that import: -scipy==1.6.3 ; python_version < "3.10" -scipy==1.8.1 ; python_version == "3.10" -scipy==1.10.1 ; python_version == "3.11" +scipy==1.10.1 ; python_version <= "3.11" +scipy==1.12.0 ; python_version == "3.12" # Pin SciPy because of failing distribution tests (see #60347) #Description: scientific python -#Pinned versions: 1.6.3 +#Pinned versions: 1.10.1 #test that import: test_unary_ufuncs.py, test_torch.py,test_tensor_creation_ops.py #test_spectral_ops.py, test_sparse_csr.py, test_reductions.py,test_nn.py #test_linalg.py, test_binary_ufuncs.py @@ -243,7 +247,8 @@ tb-nightly==2.13.0a20230426 #Pinned versions: #test that import: -#typing-extensions +# needed by torchgen utils +typing-extensions #Description: type hints for python #Pinned versions: #test that import: @@ -258,9 +263,10 @@ unittest-xml-reporting<=3.2.0,>=2.0.0 #Pinned versions: #test that import: -lintrunner==0.10.7 +#lintrunner is supported on aarch64-linux only from 0.12.4 version +lintrunner==0.12.5 #Description: all about linters! -#Pinned versions: 0.10.7 +#Pinned versions: 0.12.5 #test that import: rockset==1.0.3 @@ -268,14 +274,14 @@ rockset==1.0.3 #Pinned versions: 1.0.3 #test that import: -ghstack==0.7.1 +ghstack==0.8.0 #Description: ghstack tool -#Pinned versions: 0.7.1 +#Pinned versions: 0.8.0 #test that import: -jinja2==3.1.2 +jinja2==3.1.4 #Description: jinja2 template engine -#Pinned versions: 3.1.2 +#Pinned versions: 3.1.4 #test that import: pytest-cpp==2.3.0 @@ -293,7 +299,8 @@ tensorboard==2.13.0 #Pinned versions: #test that import: test_tensorboard -pywavelets==1.4.1 +pywavelets==1.4.1 ; python_version < "3.12" +pywavelets==1.5.0 ; python_version >= "3.12" #Description: This is a requirement of scikit-image, we need to pin # it here because 1.5.0 conflicts with numpy 1.21.2 used in CI #Pinned versions: 1.4.1 diff --git a/.ci/docker/triton_version.txt b/.ci/docker/triton_version.txt index ccbccc3dc6263..4a36342fcab70 100644 --- a/.ci/docker/triton_version.txt +++ b/.ci/docker/triton_version.txt @@ -1 +1 @@ -2.2.0 +3.0.0 diff --git a/.ci/docker/ubuntu-xpu/Dockerfile b/.ci/docker/ubuntu-xpu/Dockerfile index a34cb3b20887f..9a3ff68d159b9 100644 --- a/.ci/docker/ubuntu-xpu/Dockerfile +++ b/.ci/docker/ubuntu-xpu/Dockerfile @@ -61,15 +61,20 @@ COPY ci_commit_pins/timm.txt timm.txt RUN if [ -n "${INDUCTOR_BENCHMARKS}" ]; then bash ./install_inductor_benchmark_deps.sh; fi RUN rm install_inductor_benchmark_deps.sh common_utils.sh timm.txt huggingface.txt +# Install XPU Dependencies +ARG BASEKIT_VERSION +COPY ./common/install_xpu.sh install_xpu.sh +RUN bash ./install_xpu.sh && rm install_xpu.sh + ARG TRITON # Install triton, this needs to be done before sccache because the latter will # try to reach out to S3, which docker build runners don't have access COPY ./common/install_triton.sh install_triton.sh COPY ./common/common_utils.sh common_utils.sh -# TODO: will add triton xpu commit -COPY ci_commit_pins/triton.txt triton.txt +COPY ci_commit_pins/triton-xpu.txt triton-xpu.txt +COPY triton_version.txt triton_version.txt RUN if [ -n "${TRITON}" ]; then bash ./install_triton.sh; fi -RUN rm install_triton.sh common_utils.sh triton.txt +RUN rm install_triton.sh common_utils.sh triton-xpu.txt 
triton_version.txt # (optional) Install database packages like LMDB and LevelDB ARG DB @@ -85,11 +90,6 @@ RUN if [ -n "${VISION}" ]; then bash ./install_vision.sh; fi RUN rm install_vision.sh cache_vision_models.sh common_utils.sh ENV INSTALLED_VISION ${VISION} -# Install XPU Dependencies -ARG BASEKIT_VERSION -COPY ./common/install_xpu.sh install_xpu.sh -RUN bash ./install_xpu.sh && rm install_xpu.sh - # (optional) Install non-default CMake version ARG CMAKE_VERSION COPY ./common/install_cmake.sh install_cmake.sh diff --git a/.ci/docker/ubuntu/Dockerfile b/.ci/docker/ubuntu/Dockerfile index 0f269e84b09d6..b471ce3b8963c 100644 --- a/.ci/docker/ubuntu/Dockerfile +++ b/.ci/docker/ubuntu/Dockerfile @@ -37,6 +37,7 @@ COPY requirements-ci.txt requirements-docs.txt /opt/conda/ COPY ./common/install_conda.sh install_conda.sh COPY ./common/common_utils.sh common_utils.sh RUN bash ./install_conda.sh && rm install_conda.sh common_utils.sh /opt/conda/requirements-ci.txt /opt/conda/requirements-docs.txt +RUN if [ -n "${UNINSTALL_DILL}" ]; then pip uninstall -y dill; fi # Install gcc ARG GCC_VERSION @@ -160,10 +161,19 @@ COPY ./common/install_onnx.sh ./common/common_utils.sh ./ RUN if [ -n "${ONNX}" ]; then bash ./install_onnx.sh; fi RUN rm install_onnx.sh common_utils.sh +# (optional) Build ACL +ARG ACL +COPY ./common/install_acl.sh install_acl.sh +RUN if [ -n "${ACL}" ]; then bash ./install_acl.sh; fi +RUN rm install_acl.sh +ENV INSTALLED_ACL ${ACL} + # Install ccache/sccache (do this last, so we get priority in PATH) +ARG SKIP_SCCACHE_INSTALL COPY ./common/install_cache.sh install_cache.sh ENV PATH /opt/cache/bin:$PATH -RUN bash ./install_cache.sh && rm install_cache.sh +RUN if [ -z "${SKIP_SCCACHE_INSTALL}" ]; then bash ./install_cache.sh; fi +RUN rm install_cache.sh # Add jni.h for java host build COPY ./common/install_jni.sh install_jni.sh @@ -180,7 +190,9 @@ ARG BUILD_ENVIRONMENT ENV BUILD_ENVIRONMENT ${BUILD_ENVIRONMENT} # Install LLVM dev version (Defined in the pytorch/builder github repository) +ARG SKIP_LLVM_SRC_BUILD_INSTALL COPY --from=pytorch/llvm:9.0.1 /opt/llvm /opt/llvm +RUN if [ -n "${SKIP_LLVM_SRC_BUILD_INSTALL}" ]; then set -eu; rm -rf /opt/llvm; fi # AWS specific CUDA build guidance ENV TORCH_CUDA_ARCH_LIST Maxwell diff --git a/.ci/onnx/common.sh b/.ci/onnx/common.sh index 2c49e3ed3a2e5..3de5836a02858 100644 --- a/.ci/onnx/common.sh +++ b/.ci/onnx/common.sh @@ -1,5 +1,9 @@ +#!/bin/bash + set -ex +source "$(dirname "${BASH_SOURCE[0]}")/../pytorch/common_utils.sh" + LOCAL_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) ROOT_DIR=$(cd "$LOCAL_DIR"/../.. && pwd) TEST_DIR="$ROOT_DIR/test" diff --git a/.ci/onnx/test.sh b/.ci/onnx/test.sh index 88fad177b7e21..a7d3b72c62a7e 100755 --- a/.ci/onnx/test.sh +++ b/.ci/onnx/test.sh @@ -3,6 +3,20 @@ # shellcheck source=./common.sh source "$(dirname "${BASH_SOURCE[0]}")/common.sh" +# Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96) +WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace") +cleanup_workspace() { + echo "sudo may print the following warning message that can be ignored. The chown command will still run." + echo " sudo: setrlimit(RLIMIT_STACK): Operation not permitted" + echo "For more details refer to https://github.com/sudo-project/sudo/issues/42" + sudo chown -R "$WORKSPACE_ORIGINAL_OWNER_ID" /var/lib/jenkins/workspace +} +# Disable shellcheck SC2064 as we want to parse the original owner immediately. 
+# shellcheck disable=SC2064 +trap_add cleanup_workspace EXIT +sudo chown -R jenkins /var/lib/jenkins/workspace +git config --global --add safe.directory /var/lib/jenkins/workspace + if [[ "$BUILD_ENVIRONMENT" == *onnx* ]]; then # TODO: This can be removed later once vision is also part of the Docker image pip install -q --user --no-use-pep517 "git+https://github.com/pytorch/vision.git@$(cat .github/ci_commit_pins/vision.txt)" diff --git a/.ci/pytorch/build.sh b/.ci/pytorch/build.sh index b72461b5a68cb..b81caa0513691 100755 --- a/.ci/pytorch/build.sh +++ b/.ci/pytorch/build.sh @@ -81,7 +81,35 @@ if ! which conda; then export USE_MKLDNN=0 fi else - export CMAKE_PREFIX_PATH=/opt/conda + # CMAKE_PREFIX_PATH precedences + # 1. $CONDA_PREFIX, if defined. This follows the pytorch official build instructions. + # 2. /opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}, if ANACONDA_PYTHON_VERSION defined. + # This is for CI, which defines ANACONDA_PYTHON_VERSION but not CONDA_PREFIX. + # 3. $(conda info --base). The fallback value of pytorch official build + # instructions actually refers to this. + # Commonly this is /opt/conda/ + if [[ -v CONDA_PREFIX ]]; then + export CMAKE_PREFIX_PATH=${CONDA_PREFIX} + elif [[ -v ANACONDA_PYTHON_VERSION ]]; then + export CMAKE_PREFIX_PATH="/opt/conda/envs/py_${ANACONDA_PYTHON_VERSION}" + else + # already checked by `! which conda` + CMAKE_PREFIX_PATH="$(conda info --base)" + export CMAKE_PREFIX_PATH + fi + + # Workaround required for MKL library linkage + # https://github.com/pytorch/pytorch/issues/119557 + if [ "$ANACONDA_PYTHON_VERSION" = "3.12" ]; then + export CMAKE_LIBRARY_PATH="/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/lib/" + export CMAKE_INCLUDE_PATH="/opt/conda/envs/py_$ANACONDA_PYTHON_VERSION/include/" + fi +fi + +if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then + export USE_MKLDNN=1 + export USE_MKLDNN_ACL=1 + export ACL_ROOT_DIR=/ComputeLibrary fi if [[ "$BUILD_ENVIRONMENT" == *libtorch* ]]; then @@ -210,6 +238,24 @@ if [[ "${BUILD_ENVIRONMENT}" != *android* && "${BUILD_ENVIRONMENT}" != *cuda* ]] export BUILD_STATIC_RUNTIME_BENCHMARK=ON fi +# Do not change workspace permissions for ROCm CI jobs +# as it can leave workspace with bad permissions for cancelled jobs +if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then + # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96) + WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace") + cleanup_workspace() { + echo "sudo may print the following warning message that can be ignored. The chown command will still run." + echo " sudo: setrlimit(RLIMIT_STACK): Operation not permitted" + echo "For more details refer to https://github.com/sudo-project/sudo/issues/42" + sudo chown -R "$WORKSPACE_ORIGINAL_OWNER_ID" /var/lib/jenkins/workspace + } + # Disable shellcheck SC2064 as we want to parse the original owner immediately. + # shellcheck disable=SC2064 + trap_add cleanup_workspace EXIT + sudo chown -R jenkins /var/lib/jenkins/workspace + git config --global --add safe.directory /var/lib/jenkins/workspace +fi + if [[ "$BUILD_ENVIRONMENT" == *-bazel-* ]]; then set -e @@ -235,13 +281,17 @@ else ( ! get_exit_code python setup.py clean bad_argument ) if [[ "$BUILD_ENVIRONMENT" != *libtorch* ]]; then - # rocm builds fail when WERROR=1 # XLA test build fails when WERROR=1 # set only when building other architectures # or building non-XLA tests. 
if [[ "$BUILD_ENVIRONMENT" != *rocm* && "$BUILD_ENVIRONMENT" != *xla* ]]; then + if [[ "$BUILD_ENVIRONMENT" != *py3.8* ]]; then + # Install numpy-2.0 release candidate for builds + # Which should be backward compatible with Numpy-1.X + python -mpip install --pre numpy==2.0.0rc1 + fi WERROR=1 python setup.py bdist_wheel else python setup.py bdist_wheel @@ -341,4 +391,8 @@ if [[ "$BUILD_ENVIRONMENT" != *libtorch* && "$BUILD_ENVIRONMENT" != *bazel* ]]; python tools/stats/export_test_times.py fi -print_sccache_stats +# snadampal: skipping it till sccache support added for aarch64 +# https://github.com/pytorch/pytorch/issues/121559 +if [[ "$BUILD_ENVIRONMENT" != *aarch64* ]]; then + print_sccache_stats +fi diff --git a/.ci/pytorch/common_utils.sh b/.ci/pytorch/common_utils.sh index fa46391552217..51297f7bfff88 100644 --- a/.ci/pytorch/common_utils.sh +++ b/.ci/pytorch/common_utils.sh @@ -158,6 +158,11 @@ function install_torchvision() { fi } +function install_tlparse() { + pip_install --user "tlparse==0.3.7" + PATH="$(python -m site --user-base)/bin:$PATH" +} + function install_torchrec_and_fbgemm() { local torchrec_commit torchrec_commit=$(get_pinned_commit torchrec) diff --git a/.ci/pytorch/macos-common.sh b/.ci/pytorch/macos-common.sh index eef066b4dc9b5..1c7bc103673de 100755 --- a/.ci/pytorch/macos-common.sh +++ b/.ci/pytorch/macos-common.sh @@ -9,7 +9,7 @@ sysctl -a | grep machdep.cpu # These are required for both the build job and the test job. # In the latter to test cpp extensions. -export MACOSX_DEPLOYMENT_TARGET=11.0 +export MACOSX_DEPLOYMENT_TARGET=11.1 export CXX=clang++ export CC=clang diff --git a/.ci/pytorch/macos-test.sh b/.ci/pytorch/macos-test.sh index 739d0ba3357fe..a54b8c360eba5 100755 --- a/.ci/pytorch/macos-test.sh +++ b/.ci/pytorch/macos-test.sh @@ -149,6 +149,8 @@ test_jit_hooks() { assert_git_not_dirty } +install_tlparse + if [[ $NUM_TEST_SHARDS -gt 1 ]]; then test_python_shard "${SHARD_NUMBER}" if [[ "${SHARD_NUMBER}" == 1 ]]; then diff --git a/.ci/pytorch/multigpu-test.sh b/.ci/pytorch/multigpu-test.sh index 70ae4d2974e8c..7e04e92919cb7 100755 --- a/.ci/pytorch/multigpu-test.sh +++ b/.ci/pytorch/multigpu-test.sh @@ -34,7 +34,6 @@ time python test/run_test.py --verbose -i distributed/_shard/sharded_tensor/test # functional collective tests time python test/run_test.py --verbose -i distributed/test_functional_api - # DTensor tests time python test/run_test.py --verbose -i distributed/_tensor/test_random_ops time python test/run_test.py --verbose -i distributed/_tensor/test_dtensor_compile @@ -46,9 +45,14 @@ time python test/run_test.py --verbose -i distributed/test_device_mesh time python test/run_test.py --verbose -i distributed/tensor/parallel/test_ddp_2d_parallel time python test/run_test.py --verbose -i distributed/tensor/parallel/test_fsdp_2d_parallel time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_examples +time python test/run_test.py --verbose -i distributed/tensor/parallel/test_tp_random_state + +# FSDP2 tests +time python test/run_test.py --verbose -i distributed/_composable/fsdp/test_fully_shard_training -- -k test_2d_mlp_with_nd_mesh # Other tests time python test/run_test.py --verbose -i test_cuda_primary_ctx -time python test/run_test.py --verbose -i test_optim -- -k optimizers_with_varying_tensors +time python test/run_test.py --verbose -i test_optim -- -k test_forloop_goes_right_direction_multigpu +time python test/run_test.py --verbose -i test_optim -- -k test_mixed_device_dtype time python test/run_test.py --verbose -i 
test_foreach -- -k test_tensors_grouping assert_git_not_dirty diff --git a/.ci/pytorch/perf_test/compare_with_baseline.py b/.ci/pytorch/perf_test/compare_with_baseline.py index 49b77cbba2a5d..caf9e993bd29c 100644 --- a/.ci/pytorch/perf_test/compare_with_baseline.py +++ b/.ci/pytorch/perf_test/compare_with_baseline.py @@ -59,16 +59,16 @@ print("sample sigma: ", sample_sigma) if math.isnan(sample_mean): - raise Exception("""Error: sample mean is NaN""") + raise Exception("""Error: sample mean is NaN""") # noqa: TRY002 elif math.isnan(sample_sigma): - raise Exception("""Error: sample sigma is NaN""") + raise Exception("""Error: sample sigma is NaN""") # noqa: TRY002 z_value = (sample_mean - mean) / sigma print("z-value: ", z_value) if z_value >= 3: - raise Exception( + raise Exception( # noqa: TRY002 f"""\n z-value >= 3, there is high chance of perf regression.\n To reproduce this regression, run diff --git a/.ci/pytorch/python_doc_push_script.sh b/.ci/pytorch/python_doc_push_script.sh index 86c2037b12868..d4076d3469e9f 100755 --- a/.ci/pytorch/python_doc_push_script.sh +++ b/.ci/pytorch/python_doc_push_script.sh @@ -26,8 +26,8 @@ echo "error: python_doc_push_script.sh: version (arg2) not specified" fi # Argument 1: Where to copy the built documentation to -# (pytorch.github.io/$install_path) -install_path="${1:-${DOCS_INSTALL_PATH:-docs/${DOCS_VERSION}}}" +# (pytorch_docs/$install_path) +install_path="${1:-${DOCS_INSTALL_PATH:-${DOCS_VERSION}}}" if [ -z "$install_path" ]; then echo "error: python_doc_push_script.sh: install_path (arg1) not specified" exit 1 @@ -68,8 +68,8 @@ build_docs () { } -git clone https://github.com/pytorch/pytorch.github.io -b "$branch" --depth 1 -pushd pytorch.github.io +git clone https://github.com/pytorch/docs pytorch_docs -b "$branch" --depth 1 +pushd pytorch_docs export LC_ALL=C export PATH=/opt/conda/bin:$PATH @@ -105,6 +105,7 @@ if [ "$is_main_doc" = true ]; then echo undocumented objects found: cat build/coverage/python.txt echo "Make sure you've updated relevant .rsts in docs/source!" + echo "You can reproduce locally by running 'cd docs && make coverage && cat build/coverage/python.txt'" exit 1 fi else diff --git a/.ci/pytorch/test.sh b/.ci/pytorch/test.sh index e15fc73cf7320..a22bebc166792 100755 --- a/.ci/pytorch/test.sh +++ b/.ci/pytorch/test.sh @@ -6,6 +6,27 @@ set -ex +# shellcheck source=./common.sh +source "$(dirname "${BASH_SOURCE[0]}")/common.sh" + +# Do not change workspace permissions for ROCm CI jobs +# as it can leave workspace with bad permissions for cancelled jobs +if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then + # Workaround for dind-rootless userid mapping (https://github.com/pytorch/ci-infra/issues/96) + WORKSPACE_ORIGINAL_OWNER_ID=$(stat -c '%u' "/var/lib/jenkins/workspace") + cleanup_workspace() { + echo "sudo may print the following warning message that can be ignored. The chown command will still run." + echo " sudo: setrlimit(RLIMIT_STACK): Operation not permitted" + echo "For more details refer to https://github.com/sudo-project/sudo/issues/42" + sudo chown -R "$WORKSPACE_ORIGINAL_OWNER_ID" /var/lib/jenkins/workspace + } + # Disable shellcheck SC2064 as we want to parse the original owner immediately. 
+ # shellcheck disable=SC2064 + trap_add cleanup_workspace EXIT + sudo chown -R jenkins /var/lib/jenkins/workspace + git config --global --add safe.directory /var/lib/jenkins/workspace +fi + echo "Environment variables:" env @@ -90,9 +111,6 @@ if [[ -n $TESTS_TO_INCLUDE ]]; then INCLUDE_CLAUSE="--include $TESTS_TO_INCLUDE" fi -# shellcheck source=./common.sh -source "$(dirname "${BASH_SOURCE[0]}")/common.sh" - echo "Environment variables" env @@ -130,6 +148,8 @@ if [[ "$BUILD_ENVIRONMENT" == *cuda* || "$BUILD_ENVIRONMENT" == *rocm* ]]; then export PYTORCH_TESTING_DEVICE_ONLY_FOR="cuda" elif [[ "$BUILD_ENVIRONMENT" == *xpu* ]]; then export PYTORCH_TESTING_DEVICE_ONLY_FOR="xpu" + # setting PYTHON_TEST_EXTRA_OPTION + export PYTHON_TEST_EXTRA_OPTION="--xpu" fi if [[ "$TEST_CONFIG" == *crossref* ]]; then @@ -137,6 +157,8 @@ if [[ "$TEST_CONFIG" == *crossref* ]]; then fi if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then + # regression in ROCm 6.0 on MI50 CI runners due to hipblaslt; remove in 6.1 + export VALGRIND=OFF # Print GPU info rocminfo rocminfo | grep -E 'Name:.*\sgfx|Marketing' @@ -159,6 +181,13 @@ if [[ "$BUILD_ENVIRONMENT" != *-bazel-* ]] ; then export PATH="$HOME/.local/bin:$PATH" fi +if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then + # TODO: revisit this once the CI is stabilized on aarch64 linux + export VALGRIND=OFF +fi + +install_tlparse + # DANGER WILL ROBINSON. The LD_PRELOAD here could cause you problems # if you're not careful. Check this if you made some changes and the # ASAN test is not working @@ -205,8 +234,6 @@ if [[ "$BUILD_ENVIRONMENT" == *asan* ]]; then export LD_PRELOAD=/usr/lib/llvm-15/lib/clang/15.0.7/lib/linux/libclang_rt.asan-x86_64.so # Disable valgrind for asan export VALGRIND=OFF - # Increase stack size, because ASAN red zones use more stack - ulimit -s 81920 (cd test && python -c "import torch; print(torch.__version__, torch.version.git_version)") echo "The next four invocations are expected to crash; if they don't that means ASAN/UBSAN is misconfigured" @@ -250,14 +277,14 @@ test_python_shard() { # Bare --include flag is not supported and quoting for lint ends up with flag not being interpreted correctly # shellcheck disable=SC2086 - time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose + time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --shard "$1" "$NUM_TEST_SHARDS" --verbose $PYTHON_TEST_EXTRA_OPTION assert_git_not_dirty } test_python() { # shellcheck disable=SC2086 - time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose + time python test/run_test.py --exclude-jit-executor --exclude-distributed-tests $INCLUDE_CLAUSE --verbose $PYTHON_TEST_EXTRA_OPTION assert_git_not_dirty } @@ -268,34 +295,13 @@ test_dynamo_shard() { exit 1 fi python tools/dynamo/verify_dynamo.py - # Temporarily disable test_fx for dynamo pending the investigation on TTS - # regression in https://github.com/pytorch/torchdynamo/issues/784 + # PLEASE DO NOT ADD ADDITIONAL EXCLUDES HERE. + # Instead, use @skipIfTorchDynamo on your tests. 
time python test/run_test.py --dynamo \ + --exclude-inductor-tests \ --exclude-jit-executor \ --exclude-distributed-tests \ - --exclude \ - test_ao_sparsity \ - test_autograd \ - test_jit \ - test_proxy_tensor \ - test_quantization \ - test_public_bindings \ - test_dataloader \ - test_reductions \ - test_namedtensor \ - test_namedtuple_return_api \ - profiler/test_profiler \ - profiler/test_profiler_tree \ - test_overrides \ - test_python_dispatch \ - test_fx \ - test_package \ - test_legacy_vmap \ - test_custom_ops \ - test_content_store \ - export/test_db \ - functorch/test_dims \ - functorch/test_aotdispatch \ + --exclude-torch-export-tests \ --shard "$1" "$NUM_TEST_SHARDS" \ --verbose assert_git_not_dirty @@ -304,11 +310,23 @@ test_dynamo_shard() { test_inductor_distributed() { # Smuggle a few multi-gpu tests here so that we don't have to request another large node echo "Testing multi_gpu tests in test_torchinductor" - pytest test/inductor/test_torchinductor.py -k test_multi_gpu - pytest test/inductor/test_aot_inductor.py -k test_non_default_cuda_device - pytest test/inductor/test_aot_inductor.py -k test_replicate_on_devices - pytest test/distributed/_tensor/test_dtensor_compile.py - pytest test/distributed/tensor/parallel/test_fsdp_2d_parallel.py + python test/run_test.py -i inductor/test_torchinductor.py -k test_multi_gpu --verbose + python test/run_test.py -i inductor/test_aot_inductor.py -k test_non_default_cuda_device --verbose + python test/run_test.py -i inductor/test_aot_inductor.py -k test_replicate_on_devices --verbose + python test/run_test.py -i distributed/test_c10d_functional_native.py --verbose + python test/run_test.py -i distributed/_tensor/test_dtensor_compile.py --verbose + python test/run_test.py -i distributed/tensor/parallel/test_fsdp_2d_parallel.py --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_comm.py --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_multi_group --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_with_activation_checkpointing --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_mlp --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_hsdp --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_train_parity_2d_transformer_checkpoint_resume --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_training.py -k test_gradient_accumulation --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_frozen.py --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_compute_dtype --verbose + python test/run_test.py -i distributed/_composable/fsdp/test_fully_shard_mixed_precision.py -k test_reduce_dtype --verbose + python test/run_test.py -i distributed/fsdp/test_fsdp_tp_integration.py -k test_fsdp_tp_integration --verbose # this runs on both single-gpu and multi-gpu instance. 
It should be smart about skipping tests that aren't supported # with if required # gpus aren't available @@ -320,16 +338,24 @@ test_inductor() { python tools/dynamo/verify_dynamo.py python test/run_test.py --inductor --include test_modules test_ops test_ops_gradients test_torch --verbose # Do not add --inductor for the following inductor unit tests, otherwise we will fail because of nested dynamo state - python test/run_test.py --include inductor/test_torchinductor inductor/test_torchinductor_opinfo --verbose + python test/run_test.py --include inductor/test_torchinductor inductor/test_torchinductor_opinfo inductor/test_aot_inductor --verbose # docker build uses bdist_wheel which does not work with test_aot_inductor # TODO: need a faster way to build if [[ "$BUILD_ENVIRONMENT" != *rocm* ]]; then BUILD_AOT_INDUCTOR_TEST=1 python setup.py develop - CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aot_inductor + CPP_TESTS_DIR="${BUILD_BIN_DIR}" LD_LIBRARY_PATH="${TORCH_LIB_DIR}" python test/run_test.py --cpp --verbose -i cpp/test_aoti_abi_check cpp/test_aoti_inference fi } +test_inductor_cpp_wrapper_abi_compatible() { + export TORCHINDUCTOR_ABI_COMPATIBLE=1 + echo "Testing Inductor cpp wrapper mode with TORCHINDUCTOR_ABI_COMPATIBLE=1" + # cpu stack allocation causes segfault and needs more investigation + TORCHINDUCTOR_STACK_ALLOCATION=0 python test/run_test.py --include inductor/test_cpu_cpp_wrapper + python test/run_test.py --include inductor/test_cuda_cpp_wrapper +} + # "Global" flags for inductor benchmarking controlled by TEST_CONFIG # For example 'dynamic_aot_eager_torchbench' TEST_CONFIG means we run # the benchmark script with '--dynamic-shapes --backend aot_eager --device cuda' @@ -422,7 +448,7 @@ test_perf_for_dashboard() { --output "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_freezing_autotune_${suite}_${dtype}_${mode}_cuda_${target}.csv" fi if [[ "$DASHBOARD_TAG" == *aotinductor-true* ]] && [[ "$mode" == "inference" ]]; then - python "benchmarks/dynamo/$suite.py" \ + TORCHINDUCTOR_ABI_COMPATIBLE=1 python "benchmarks/dynamo/$suite.py" \ "${target_flag[@]}" --"$mode" --"$dtype" --export-aot-inductor --disable-cudagraphs "$@" \ --output "$TEST_REPORTS_DIR/${backend}_aot_inductor_${suite}_${dtype}_${mode}_cuda_${target}.csv" fi @@ -431,6 +457,17 @@ test_perf_for_dashboard() { "${target_flag[@]}" --"$mode" --"$dtype" --backend "$backend" "$@" \ --output "$TEST_REPORTS_DIR/${backend}_max_autotune_${suite}_${dtype}_${mode}_cuda_${target}.csv" fi + if [[ "$DASHBOARD_TAG" == *cudagraphs_low_precision-true* ]] && [[ "$mode" == "inference" ]]; then + # TODO: This has a new dtype called quant and the benchmarks script needs to be updated to support this. + # The tentative command is as follows. It doesn't work now, but it's ok because we only need mock data + # to fill the dashboard. + python "benchmarks/dynamo/$suite.py" \ + "${target_flag[@]}" --"$mode" --quant --backend "$backend" "$@" \ + --output "$TEST_REPORTS_DIR/${backend}_cudagraphs_low_precision_${suite}_quant_${mode}_cuda_${target}.csv" || true + # Copy cudagraph results as mock data, easiest choice? 
+ cp "$TEST_REPORTS_DIR/${backend}_with_cudagraphs_${suite}_${dtype}_${mode}_cuda_${target}.csv" \ + "$TEST_REPORTS_DIR/${backend}_cudagraphs_low_precision_${suite}_quant_${mode}_cuda_${target}.csv" + fi done done } @@ -466,6 +503,11 @@ test_single_dynamo_benchmark() { test_perf_for_dashboard "$suite" \ "${DYNAMO_BENCHMARK_FLAGS[@]}" "$@" "${partition_flags[@]}" else + if [[ "${TEST_CONFIG}" == *aot_inductor* ]]; then + # Test AOTInductor with the ABI-compatible mode on CI + # This can be removed once the ABI-compatible mode becomes default. + export TORCHINDUCTOR_ABI_COMPATIBLE=1 + fi python "benchmarks/dynamo/$suite.py" \ --ci --accuracy --timing --explain \ "${DYNAMO_BENCHMARK_FLAGS[@]}" \ @@ -480,6 +522,11 @@ test_single_dynamo_benchmark() { fi } +test_inductor_micro_benchmark() { + TEST_REPORTS_DIR=$(pwd)/test/test-micro-reports + python benchmarks/gpt_fast/benchmark.py +} + test_dynamo_benchmark() { # Usage: test_dynamo_benchmark huggingface 0 TEST_REPORTS_DIR=$(pwd)/test/test-reports @@ -522,7 +569,7 @@ test_inductor_torchbench_smoketest_perf() { # The threshold value needs to be actively maintained to make this check useful python benchmarks/dynamo/check_perf_csv.py -f "$TEST_REPORTS_DIR/inductor_training_smoketest.csv" -t 1.4 - python benchmarks/dynamo/torchbench.py --device cuda --performance --bfloat16 --inference \ + TORCHINDUCTOR_ABI_COMPATIBLE=1 python benchmarks/dynamo/torchbench.py --device cuda --performance --bfloat16 --inference \ --export-aot-inductor --only nanogpt --output "$TEST_REPORTS_DIR/inductor_inference_smoketest.csv" # The threshold value needs to be actively maintained to make this check useful # The perf number of nanogpt seems not very stable, e.g. @@ -543,6 +590,56 @@ test_inductor_torchbench_smoketest_perf() { done } +test_inductor_torchbench_cpu_smoketest_perf(){ + TEST_REPORTS_DIR=$(pwd)/test/test-reports + mkdir -p "$TEST_REPORTS_DIR" + + #set jemalloc + JEMALLOC_LIB="/usr/lib/x86_64-linux-gnu/libjemalloc.so.2" + IOMP_LIB="$(dirname "$(which python)")/../lib/libiomp5.so" + export LD_PRELOAD="$JEMALLOC_LIB":"$IOMP_LIB":"$LD_PRELOAD" + export MALLOC_CONF="oversize_threshold:1,background_thread:true,metadata_thp:auto,dirty_decay_ms:-1,muzzy_decay_ms:-1" + export KMP_AFFINITY=granularity=fine,compact,1,0 + export KMP_BLOCKTIME=1 + CORES=$(lscpu | grep Core | awk '{print $4}') + export OMP_NUM_THREADS=$CORES + end_core=$(( CORES-1 )) + + MODELS_SPEEDUP_TARGET=benchmarks/dynamo/expected_ci_speedup_inductor_torchbench_cpu.csv + + grep -v '^ *#' < "$MODELS_SPEEDUP_TARGET" | while IFS=',' read -r -a model_cfg + do + local model_name=${model_cfg[0]} + local data_type=${model_cfg[1]} + local speedup_target=${model_cfg[4]} + if [[ ${model_cfg[3]} == "cpp" ]]; then + export TORCHINDUCTOR_CPP_WRAPPER=1 + else + unset TORCHINDUCTOR_CPP_WRAPPER + fi + local output_name="$TEST_REPORTS_DIR/inductor_inference_${model_cfg[0]}_${model_cfg[1]}_${model_cfg[2]}_${model_cfg[3]}_cpu_smoketest.csv" + + if [[ ${model_cfg[2]} == "dynamic" ]]; then + taskset -c 0-"$end_core" python benchmarks/dynamo/torchbench.py \ + --inference --performance --"$data_type" -dcpu -n50 --only "$model_name" --dynamic-shapes \ + --dynamic-batch-only --freezing --timeout 9000 --backend=inductor --output "$output_name" + else + taskset -c 0-"$end_core" python benchmarks/dynamo/torchbench.py \ + --inference --performance --"$data_type" -dcpu -n50 --only "$model_name" \ + --freezing --timeout 9000 --backend=inductor --output "$output_name" + fi + cat "$output_name" + # The threshold value needs to be 
actively maintained to make this check useful. + python benchmarks/dynamo/check_perf_csv.py -f "$output_name" -t "$speedup_target" + done +} + +test_torchbench_gcp_smoketest(){ + pushd "${TORCHBENCHPATH}" + python test.py -v + popd +} + test_python_gloo_with_tls() { source "$(dirname "${BASH_SOURCE[0]}")/run_glootls_test.sh" assert_git_not_dirty @@ -693,9 +790,8 @@ test_xpu_bin(){ TEST_REPORTS_DIR=$(pwd)/test/test-reports mkdir -p "$TEST_REPORTS_DIR" - for xpu_case in "${BUILD_BIN_DIR}"/*{xpu,sycl}* - do - if [[ "$xpu_case" != *"*"* ]]; then + for xpu_case in "${BUILD_BIN_DIR}"/*{xpu,sycl}*; do + if [[ "$xpu_case" != *"*"* && "$xpu_case" != *.so && "$xpu_case" != *.a ]]; then case_name=$(basename "$xpu_case") echo "Testing ${case_name} ..." "$xpu_case" --gtest_output=xml:"$TEST_REPORTS_DIR"/"$case_name".xml @@ -943,7 +1039,8 @@ test_bazel() { tools/bazel test --config=cpu-only --test_timeout=480 --test_output=all --test_tag_filters=-gpu-required --test_filter=-*CUDA :all_tests else - tools/bazel test --test_output=errors \ + # Increase the test timeout to 480 like CPU tests because modules_test frequently timeout + tools/bazel test --test_timeout=480 --test_output=errors \ //:any_test \ //:autograd_test \ //:dataloader_test \ @@ -1038,14 +1135,17 @@ test_docs_test() { } test_executorch() { + echo "Install torchvision and torchaudio" + install_torchvision + install_torchaudio + pushd /executorch - echo "Install torchvision and torchaudio" - # TODO(huydhn): Switch this to the pinned commits on ExecuTorch once they are - # there. These libraries need to be built here, and not part of the Docker - # image because they require the target version of torch to be installed first - pip_install --no-use-pep517 --user "git+https://github.com/pytorch/audio.git" - pip_install --no-use-pep517 --user "git+https://github.com/pytorch/vision.git" + # NB: We need to build ExecuTorch runner here and not inside the Docker image + # because it depends on PyTorch + # shellcheck disable=SC1091 + source .ci/scripts/utils.sh + build_executorch_runner "cmake" echo "Run ExecuTorch regression tests for some models" # NB: This is a sample model, more can be added here @@ -1063,11 +1163,33 @@ test_executorch() { assert_git_not_dirty } +test_linux_aarch64(){ + python test/run_test.py --include test_modules test_mkldnn test_mkldnn_fusion test_openmp test_torch test_dynamic_shapes \ + test_transformers test_multiprocessing test_numpy_interop --verbose + + # Dynamo tests + python test/run_test.py --include dynamo/test_compile dynamo/test_backends dynamo/test_comptime dynamo/test_config \ + dynamo/test_functions dynamo/test_fx_passes_pre_grad dynamo/test_interop dynamo/test_model_output dynamo/test_modules \ + dynamo/test_optimizers dynamo/test_recompile_ux dynamo/test_recompiles --verbose + + # Inductor tests + python test/run_test.py --include inductor/test_torchinductor inductor/test_benchmark_fusion inductor/test_codecache \ + inductor/test_config inductor/test_control_flow inductor/test_coordinate_descent_tuner inductor/test_fx_fusion \ + inductor/test_group_batch_fusion inductor/test_inductor_freezing inductor/test_inductor_utils \ + inductor/test_inplacing_pass inductor/test_kernel_benchmark inductor/test_layout_optim \ + inductor/test_max_autotune inductor/test_memory_planning inductor/test_metrics inductor/test_multi_kernel inductor/test_pad_mm \ + inductor/test_pattern_matcher inductor/test_perf inductor/test_profiler inductor/test_select_algorithm inductor/test_smoke \ + inductor/test_split_cat_fx_passes 
inductor/test_standalone_compile inductor/test_torchinductor \ + inductor/test_torchinductor_codegen_dynamic_shapes inductor/test_torchinductor_dynamic_shapes --verbose +} + if ! [[ "${BUILD_ENVIRONMENT}" == *libtorch* || "${BUILD_ENVIRONMENT}" == *-bazel-* ]]; then (cd test && python -c "import torch; print(torch.__config__.show())") (cd test && python -c "import torch; print(torch.__config__.parallel_info())") fi -if [[ "${TEST_CONFIG}" == *backward* ]]; then +if [[ "$BUILD_ENVIRONMENT" == *aarch64* ]]; then + test_linux_aarch64 +elif [[ "${TEST_CONFIG}" == *backward* ]]; then test_forward_backward_compatibility # Do NOT add tests after bc check tests, see its comment. elif [[ "${TEST_CONFIG}" == *xla* ]]; then @@ -1092,6 +1214,8 @@ elif [[ "$TEST_CONFIG" == deploy ]]; then test_torch_deploy elif [[ "${TEST_CONFIG}" == *inductor_distributed* ]]; then test_inductor_distributed +elif [[ "${TEST_CONFIG}" == *inductor-micro-benchmark* ]]; then + test_inductor_micro_benchmark elif [[ "${TEST_CONFIG}" == *huggingface* ]]; then install_torchvision id=$((SHARD_NUMBER-1)) @@ -1114,6 +1238,14 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then if [[ "${TEST_CONFIG}" == *inductor_torchbench_smoketest_perf* ]]; then checkout_install_torchbench hf_Bert hf_Albert nanogpt timm_vision_transformer PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_smoketest_perf + elif [[ "${TEST_CONFIG}" == *inductor_torchbench_cpu_smoketest_perf* ]]; then + checkout_install_torchbench timm_vision_transformer phlippe_densenet basic_gnn_gcn \ + llama_v2_7b_16h resnet50 timm_efficientnet mobilenet_v3_large timm_resnest \ + shufflenet_v2_x1_0 hf_GPT2 + PYTHONPATH=$(pwd)/torchbench test_inductor_torchbench_cpu_smoketest_perf + elif [[ "${TEST_CONFIG}" == *torchbench_gcp_smoketest* ]]; then + checkout_install_torchbench + TORCHBENCHPATH=$(pwd)/torchbench test_torchbench_gcp_smoketest else checkout_install_torchbench # Do this after checkout_install_torchbench to ensure we clobber any @@ -1123,6 +1255,9 @@ elif [[ "${TEST_CONFIG}" == *torchbench* ]]; then fi PYTHONPATH=$(pwd)/torchbench test_dynamo_benchmark torchbench "$id" fi +elif [[ "${TEST_CONFIG}" == *inductor_cpp_wrapper_abi_compatible* ]]; then + install_torchvision + test_inductor_cpp_wrapper_abi_compatible elif [[ "${TEST_CONFIG}" == *inductor* && "${SHARD_NUMBER}" == 1 ]]; then install_torchvision test_inductor diff --git a/.ci/pytorch/win-test-helpers/build_pytorch.bat b/.ci/pytorch/win-test-helpers/build_pytorch.bat index 070e7a14687ee..28bd083f984ab 100644 --- a/.ci/pytorch/win-test-helpers/build_pytorch.bat +++ b/.ci/pytorch/win-test-helpers/build_pytorch.bat @@ -16,24 +16,23 @@ set PATH=C:\Program Files\CMake\bin;C:\Program Files\7-Zip;C:\ProgramData\chocol set INSTALLER_DIR=%SCRIPT_HELPERS_DIR%\installation-helpers - -call %INSTALLER_DIR%\install_mkl.bat -if errorlevel 1 exit /b -if not errorlevel 0 exit /b - call %INSTALLER_DIR%\install_magma.bat -if errorlevel 1 exit /b -if not errorlevel 0 exit /b +if errorlevel 1 goto fail +if not errorlevel 0 goto fail call %INSTALLER_DIR%\install_sccache.bat -if errorlevel 1 exit /b -if not errorlevel 0 exit /b +if errorlevel 1 goto fail +if not errorlevel 0 goto fail :: Miniconda has been installed as part of the Windows AMI with all the dependencies. 
:: We just need to activate it here call %INSTALLER_DIR%\activate_miniconda3.bat -if errorlevel 1 exit /b -if not errorlevel 0 exit /b +if errorlevel 1 goto fail +if not errorlevel 0 goto fail + +call pip install mkl-include==2021.4.0 mkl-devel==2021.4.0 +if errorlevel 1 goto fail +if not errorlevel 0 goto fail :: Override VS env here pushd . @@ -42,8 +41,8 @@ if "%VC_VERSION%" == "" ( ) else ( call "C:\Program Files (x86)\Microsoft Visual Studio\%VC_YEAR%\%VC_PRODUCT%\VC\Auxiliary\Build\vcvarsall.bat" x64 -vcvars_ver=%VC_VERSION% ) -if errorlevel 1 exit /b -if not errorlevel 0 exit /b +if errorlevel 1 goto fail +if not errorlevel 0 goto fail @echo on popd @@ -53,12 +52,12 @@ set CUDA_PATH=C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v%CUDA_VERSION% if x%CUDA_VERSION:.=%==x%CUDA_VERSION% ( echo CUDA version %CUDA_VERSION% format isn't correct, which doesn't contain '.' - exit /b 1 + goto fail ) rem version transformer, for example 10.1 to 10_1. if x%CUDA_VERSION:.=%==x%CUDA_VERSION% ( echo CUDA version %CUDA_VERSION% format isn't correct, which doesn't contain '.' - exit /b 1 + goto fail ) set VERSION_SUFFIX=%CUDA_VERSION:.=_% set CUDA_PATH_V%VERSION_SUFFIX%=%CUDA_PATH% @@ -89,8 +88,8 @@ set SCCACHE_IGNORE_SERVER_IO_ERROR=1 sccache --stop-server sccache --start-server sccache --zero-stats -set CC=sccache-cl -set CXX=sccache-cl +set CMAKE_C_COMPILER_LAUNCHER=sccache +set CMAKE_CXX_COMPILER_LAUNCHER=sccache set CMAKE_GENERATOR=Ninja @@ -102,8 +101,8 @@ if "%USE_CUDA%"=="1" ( :: CMake requires a single command as CUDA_NVCC_EXECUTABLE, so we push the wrappers :: randomtemp.exe and sccache.exe into a batch file which CMake invokes. curl -kL https://github.com/peterjc123/randomtemp-rust/releases/download/v0.4/randomtemp.exe --output %TMP_DIR_WIN%\bin\randomtemp.exe - if errorlevel 1 exit /b - if not errorlevel 0 exit /b + if errorlevel 1 goto fail + if not errorlevel 0 goto fail echo @"%TMP_DIR_WIN%\bin\randomtemp.exe" "%TMP_DIR_WIN%\bin\sccache.exe" "%CUDA_PATH%\bin\nvcc.exe" %%* > "%TMP_DIR%/bin/nvcc.bat" cat %TMP_DIR%/bin/nvcc.bat set CUDA_NVCC_EXECUTABLE=%TMP_DIR%/bin/nvcc.bat @@ -115,8 +114,8 @@ if "%USE_CUDA%"=="1" ( set python setup.py bdist_wheel -if errorlevel 1 exit /b -if not errorlevel 0 exit /b +if errorlevel 1 goto fail +if not errorlevel 0 goto fail sccache --show-stats python -c "import os, glob; os.system('python -mpip install --no-index --no-deps ' + glob.glob('dist/*.whl')[0])" ( @@ -136,3 +135,8 @@ python -c "import os, glob; os.system('python -mpip install --no-index --no-deps sccache --show-stats --stats-format json | jq .stats > sccache-stats-%BUILD_ENVIRONMENT%-%OUR_GITHUB_JOB_ID%.json sccache --stop-server + +exit /b 0 + +:fail +exit /b 1 diff --git a/.ci/pytorch/win-test-helpers/installation-helpers/install_mkl.bat b/.ci/pytorch/win-test-helpers/installation-helpers/install_mkl.bat deleted file mode 100644 index 6c676d1baeded..0000000000000 --- a/.ci/pytorch/win-test-helpers/installation-helpers/install_mkl.bat +++ /dev/null @@ -1,14 +0,0 @@ -if "%REBUILD%"=="" ( - if "%BUILD_ENVIRONMENT%"=="" ( - curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/mkl_2020.2.254.7z --output %TMP_DIR_WIN%\mkl.7z - ) else ( - aws s3 cp s3://ossci-windows/mkl_2020.2.254.7z %TMP_DIR_WIN%\mkl.7z --quiet - ) - if errorlevel 1 exit /b - if not errorlevel 0 exit /b - 7z x -aoa %TMP_DIR_WIN%\mkl.7z -o%TMP_DIR_WIN%\mkl - if errorlevel 1 exit /b - if not errorlevel 0 exit /b -) -set CMAKE_INCLUDE_PATH=%TMP_DIR_WIN%\mkl\include -set LIB=%TMP_DIR_WIN%\mkl\lib;%LIB% 
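The Windows build script above no longer masquerades sccache as the compiler (`sccache-cl`); it now points CMake at sccache through the compiler-launcher variables. A minimal sketch of the same mechanism invoked directly, where the generator and the source/build paths are illustrative assumptions rather than what CI uses:

```sh
# CMake prepends the launcher to every compile command, so sccache wraps
# cl/gcc/clang without having to pretend to be the compiler itself.
cmake -GNinja \
  -DCMAKE_C_COMPILER_LAUNCHER=sccache \
  -DCMAKE_CXX_COMPILER_LAUNCHER=sccache \
  -S . -B build
cmake --build build
sccache --show-stats   # a rebuild should show cache hits
```

Exporting the variables in the environment, as the batch script does, has the same effect: CMake 3.17+ initializes `CMAKE_<LANG>_COMPILER_LAUNCHER` from the environment on the first configure.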
diff --git a/.ci/pytorch/win-test-helpers/installation-helpers/install_sccache.bat b/.ci/pytorch/win-test-helpers/installation-helpers/install_sccache.bat index 6f8cc15ba8684..7989f7c6ece3f 100644 --- a/.ci/pytorch/win-test-helpers/installation-helpers/install_sccache.bat +++ b/.ci/pytorch/win-test-helpers/installation-helpers/install_sccache.bat @@ -1,18 +1,13 @@ mkdir %TMP_DIR_WIN%\bin if "%REBUILD%"=="" ( - :check_sccache - %TMP_DIR_WIN%\bin\sccache.exe --show-stats || ( + IF EXIST %TMP_DIR_WIN%\bin\sccache.exe ( taskkill /im sccache.exe /f /t || ver > nul del %TMP_DIR_WIN%\bin\sccache.exe || ver > nul - del %TMP_DIR_WIN%\bin\sccache-cl.exe || ver > nul - if "%BUILD_ENVIRONMENT%"=="" ( - curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache.exe --output %TMP_DIR_WIN%\bin\sccache.exe - curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache-cl.exe --output %TMP_DIR_WIN%\bin\sccache-cl.exe - ) else ( - aws s3 cp s3://ossci-windows/sccache.exe %TMP_DIR_WIN%\bin\sccache.exe - aws s3 cp s3://ossci-windows/sccache-cl.exe %TMP_DIR_WIN%\bin\sccache-cl.exe - ) - goto :check_sccache ) -) + if "%BUILD_ENVIRONMENT%"=="" ( + curl --retry 3 --retry-all-errors -k https://s3.amazonaws.com/ossci-windows/sccache-v0.7.4.exe --output %TMP_DIR_WIN%\bin\sccache.exe + ) else ( + aws s3 cp s3://ossci-windows/sccache-v0.7.4.exe %TMP_DIR_WIN%\bin\sccache.exe + ) +) \ No newline at end of file diff --git a/.circleci/README.md b/.circleci/README.md index 569f58a1242e6..24dde8b47666f 100644 --- a/.circleci/README.md +++ b/.circleci/README.md @@ -1,468 +1,4 @@ Warning ======= -Contents may be out of date. Our CircleCI workflows are gradually being migrated to Github actions. - -Structure of CI -=============== - -setup job: -1. Does a git checkout -2. Persists CircleCI scripts (everything in `.circleci`) into a workspace. Why? - We don't always do a Git checkout on all subjobs, but we usually - still want to be able to call scripts one way or another in a subjob. - Persisting files this way lets us have access to them without doing a - checkout. This workspace is conventionally mounted on `~/workspace` - (this is distinguished from `~/project`, which is the conventional - working directory that CircleCI will default to starting your jobs - in.) -3. Write out the commit message to `.circleci/COMMIT_MSG`. This is so - we can determine in subjobs if we should actually run the jobs or - not, even if there isn't a Git checkout. - - -CircleCI configuration generator -================================ - -One may no longer make changes to the `.circleci/config.yml` file directly. -Instead, one must edit these Python scripts or files in the `verbatim-sources/` directory. - - -Usage ----------- - -1. Make changes to these scripts. -2. Run the `regenerate.sh` script in this directory and commit the script changes and the resulting change to `config.yml`. - -You'll see a build failure on GitHub if the scripts don't agree with the checked-in version. - - -Motivation ----------- - -These scripts establish a single, authoritative source of documentation for the CircleCI configuration matrix. -The documentation, in the form of diagrams, is automatically generated and cannot drift out of sync with the YAML content. - -Furthermore, consistency is enforced within the YAML config itself, by using a single source of data to generate -multiple parts of the file. 
- -* Facilitates one-off culling/enabling of CI configs for testing PRs on special targets - -Also see https://github.com/pytorch/pytorch/issues/17038 - - -Future direction ----------------- - -### Declaring sparse config subsets -See comment [here](https://github.com/pytorch/pytorch/pull/17323#pullrequestreview-206945747): - -In contrast with a full recursive tree traversal of configuration dimensions, -> in the future I think we actually want to decrease our matrix somewhat and have only a few mostly-orthogonal builds that taste as many different features as possible on PRs, plus a more complete suite on every PR and maybe an almost full suite nightly/weekly (we don't have this yet). Specifying PR jobs in the future might be easier to read with an explicit list when we come to this. ----------------- ----------------- - -# How do the binaries / nightlies / releases work? - -### What is a binary? - -A binary or package (used interchangeably) is a pre-built collection of c++ libraries, header files, python bits, and other files. We build these and distribute them so that users do not need to install from source. - -A **binary configuration** is a collection of - -* release or nightly - * releases are stable, nightlies are beta and built every night -* python version - * linux: 3.7m (mu is wide unicode or something like that. It usually doesn't matter but you should know that it exists) - * macos: 3.7, 3.8 - * windows: 3.7, 3.8 -* cpu version - * cpu, cuda 9.0, cuda 10.0 - * The supported cuda versions occasionally change -* operating system - * Linux - these are all built on CentOS. There haven't been any problems in the past building on CentOS and using on Ubuntu - * MacOS - * Windows - these are built on Azure pipelines -* devtoolset version (gcc compiler version) - * This only matters on Linux cause only Linux uses gcc. tldr is gcc made a backwards incompatible change from gcc 4.8 to gcc 5, because it had to change how it implemented std::vector and std::string - -### Where are the binaries? - -The binaries are built in CircleCI. There are nightly binaries built every night at 9pm PST (midnight EST) and release binaries corresponding to Pytorch releases, usually every few months. - -We have 3 types of binary packages - -* pip packages - nightlies are stored on s3 (pip install -f \). releases are stored in a pip repo (pip install torch) (ask Soumith about this) -* conda packages - nightlies and releases are both stored in a conda repo. Nighty packages have a '_nightly' suffix -* libtorch packages - these are zips of all the c++ libraries, header files, and sometimes dependencies. These are c++ only - * shared with dependencies (the only supported option for Windows) - * static with dependencies - * shared without dependencies - * static without dependencies - -All binaries are built in CircleCI workflows except Windows. There are checked-in workflows (committed into the .circleci/config.yml) to build the nightlies every night. Releases are built by manually pushing a PR that builds the suite of release binaries (overwrite the config.yml to build the release) - -# CircleCI structure of the binaries - -Some quick vocab: - -* A \**workflow** is a CircleCI concept; it is a DAG of '**jobs**'. ctrl-f 'workflows' on https://github.com/pytorch/pytorch/blob/main/.circleci/config.yml to see the workflows. -* **jobs** are a sequence of '**steps**' -* **steps** are usually just a bash script or a builtin CircleCI command. 
*All steps run in new environments, environment variables declared in one script DO NOT persist to following steps* -* CircleCI has a **workspace**, which is essentially a cache between steps of the *same job* in which you can store artifacts between steps. - -## How are the workflows structured? - -The nightly binaries have 3 workflows. We have one job (actually 3 jobs: build, test, and upload) per binary configuration - -1. binary_builds - 1. every day midnight EST - 2. linux: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/linux-binary-build-defaults.yml - 3. macos: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/macos-binary-build-defaults.yml - 4. For each binary configuration, e.g. linux_conda_3.7_cpu there is a - 1. binary_linux_conda_3.7_cpu_build - 1. Builds the build. On linux jobs this uses the 'docker executor'. - 2. Persists the package to the workspace - 2. binary_linux_conda_3.7_cpu_test - 1. Loads the package to the workspace - 2. Spins up a docker image (on Linux), mapping the package and code repos into the docker - 3. Runs some smoke tests in the docker - 4. (Actually, for macos this is a step rather than a separate job) - 3. binary_linux_conda_3.7_cpu_upload - 1. Logs in to aws/conda - 2. Uploads the package -2. update_s3_htmls - 1. every day 5am EST - 2. https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/binary_update_htmls.yml - 3. See below for what these are for and why they're needed - 4. Three jobs that each examine the current contents of aws and the conda repo and update some html files in s3 -3. binarysmoketests - 1. every day - 2. https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml - 3. For each binary configuration, e.g. linux_conda_3.7_cpu there is a - 1. smoke_linux_conda_3.7_cpu - 1. Downloads the package from the cloud, e.g. using the official pip or conda instructions - 2. Runs the smoke tests - -## How are the jobs structured? - -The jobs are in https://github.com/pytorch/pytorch/tree/main/.circleci/verbatim-sources. Jobs are made of multiple steps. There are some shared steps used by all the binaries/smokes. Steps of these jobs are all delegated to scripts in https://github.com/pytorch/pytorch/tree/main/.circleci/scripts . 
- -* Linux jobs: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/linux-binary-build-defaults.yml - * binary_linux_build.sh - * binary_linux_test.sh - * binary_linux_upload.sh -* MacOS jobs: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/macos-binary-build-defaults.yml - * binary_macos_build.sh - * binary_macos_test.sh - * binary_macos_upload.sh -* Update html jobs: https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/binary_update_htmls.yml - * These delegate from the pytorch/builder repo - * https://github.com/pytorch/builder/blob/main/cron/update_s3_htmls.sh - * https://github.com/pytorch/builder/blob/main/cron/upload_binary_sizes.sh -* Smoke jobs (both linux and macos): https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/nightly-build-smoke-tests-defaults.yml - * These delegate from the pytorch/builder repo - * https://github.com/pytorch/builder/blob/main/run_tests.sh - * https://github.com/pytorch/builder/blob/main/smoke_test.sh - * https://github.com/pytorch/builder/blob/main/check_binary.sh -* Common shared code (shared across linux and macos): https://github.com/pytorch/pytorch/blob/main/.circleci/verbatim-sources/nightly-binary-build-defaults.yml - * binary_checkout.sh - checks out pytorch/builder repo. Right now this also checks out pytorch/pytorch, but it shouldn't. pytorch/pytorch should just be shared through the workspace. This can handle being run before binary_populate_env.sh - * binary_populate_env.sh - parses BUILD_ENVIRONMENT into the separate env variables that make up a binary configuration. Also sets lots of default values, the date, the version strings, the location of folders in s3, all sorts of things. This generally has to be run before other steps. - * binary_install_miniconda.sh - Installs miniconda, cross platform. Also hacks this for the update_binary_sizes job that doesn't have the right env variables - * binary_run_in_docker.sh - Takes a bash script file (the actual test code) from a hardcoded location, spins up a docker image, and runs the script inside the docker image - -### **Why do the steps all refer to scripts?** - -CircleCI creates a final yaml file by inlining every <<* segment, so if we were to keep all the code in the config.yml itself then the config size would go over 4 MB and cause infra problems. - -### **What is binary_run_in_docker for?** - -So, CircleCI has several executor types: macos, machine, and docker are the ones we use. The 'machine' executor gives you two cores on some linux vm. The 'docker' executor gives you considerably more cores (nproc was 32 instead of 2 back when I tried in February). Since the dockers are faster, we try to run everything that we can in dockers. Thus - -* linux build jobs use the docker executor. Running them on the docker executor was at least 2x faster than running them on the machine executor -* linux test jobs use the machine executor in order for them to properly interface with GPUs since docker executors cannot execute with attached GPUs -* linux upload jobs use the machine executor. The upload jobs are so short that it doesn't really matter what they use -* linux smoke test jobs use the machine executor for the same reason as the linux test jobs - -binary_run_in_docker.sh is a way to share the docker start-up code between the binary test jobs and the binary smoke test jobs - -### **Why does binary_checkout also checkout pytorch? 
Why shouldn't it?** - -We want all the nightly binary jobs to run on the exact same git commit, so we wrote our own checkout logic to ensure that the same commit was always picked. Later circleci changed that to use a single pytorch checkout and persist it through the workspace (they did this because our config file was too big, so they wanted to take a lot of the setup code into scripts, but the scripts needed the code repo to exist to be called, so they added a prereq step called 'setup' to checkout the code and persist the needed scripts to the workspace). The changes to the binary jobs were not properly tested, so they all broke from missing pytorch code no longer existing. We hotfixed the problem by adding the pytorch checkout back to binary_checkout, so now there's two checkouts of pytorch on the binary jobs. This problem still needs to be fixed, but it takes careful tracing of which code is being called where. - -# Code structure of the binaries (circleci agnostic) - -## Overview - -The code that runs the binaries lives in two places, in the normal [github.com/pytorch/pytorch](http://github.com/pytorch/pytorch), but also in [github.com/pytorch/builder](http://github.com/pytorch/builder), which is a repo that defines how all the binaries are built. The relevant code is - - -``` -# All code needed to set-up environments for build code to run in, -# but only code that is specific to the current CI system -pytorch/pytorch -- .circleci/ # Folder that holds all circleci related stuff - - config.yml # GENERATED file that actually controls all circleci behavior - - verbatim-sources # Used to generate job/workflow sections in ^ - - scripts/ # Code needed to prepare circleci environments for binary build scripts -- setup.py # Builds pytorch. This is wrapped in pytorch/builder -- cmake files # used in normal building of pytorch -# All code needed to prepare a binary build, given an environment -# with all the right variables/packages/paths. -pytorch/builder -# Given an installed binary and a proper python env, runs some checks -# to make sure the binary was built the proper way. Checks things like -# the library dependencies, symbols present, etc. -- check_binary.sh -# Given an installed binary, runs python tests to make sure everything -# is in order. These should be de-duped. Right now they both run smoke -# tests, but are called from different places. Usually just call some -# import statements, but also has overlap with check_binary.sh above -- run_tests.sh -- smoke_test.sh -# Folders that govern how packages are built. See paragraphs below -- conda/ - - build_pytorch.sh # Entrypoint. Delegates to proper conda build folder - - switch_cuda_version.sh # Switches activate CUDA installation in Docker - - pytorch-nightly/ # Build-folder -- manywheel/ - - build_cpu.sh # Entrypoint for cpu builds - - build.sh # Entrypoint for CUDA builds - - build_common.sh # Actual build script that ^^ call into -- wheel/ - - build_wheel.sh # Entrypoint for wheel builds -- windows/ - - build_pytorch.bat # Entrypoint for wheel builds on Windows -``` - -Every type of package has an entrypoint build script that handles the all the important logic. - -## Conda - -Linux, MacOS and Windows use the same code flow for the conda builds. - -Conda packages are built with conda-build, see https://conda.io/projects/conda-build/en/latest/resources/commands/conda-build.html - -Basically, you pass `conda build` a build folder (pytorch-nightly/ above) that contains a build script and a meta.yaml. 
The meta.yaml specifies in what python environment to build the package in, and what dependencies the resulting package should have, and the build script gets called in the env to build the thing. -tl;dr on conda-build is - -1. Creates a brand new conda environment, based off of deps in the meta.yaml - 1. Note that environment variables do not get passed into this build env unless they are specified in the meta.yaml - 2. If the build fails this environment will stick around. You can activate it for much easier debugging. The "General Python" section below explains what exactly a python "environment" is. -2. Calls build.sh in the environment -3. Copies the finished package to a new conda env, also specified by the meta.yaml -4. Runs some simple import tests (if specified in the meta.yaml) -5. Saves the finished package as a tarball - -The build.sh we use is essentially a wrapper around `python setup.py build`, but it also manually copies in some of our dependent libraries into the resulting tarball and messes with some rpaths. - -The entrypoint file `builder/conda/build_conda.sh` is complicated because - -* It works for Linux, MacOS and Windows - * The mac builds used to create their own environments, since they all used to be on the same machine. There's now a lot of extra logic to handle conda envs. This extra machinery could be removed -* It used to handle testing too, which adds more logic messing with python environments too. This extra machinery could be removed. - -## Manywheels (linux pip and libtorch packages) - -Manywheels are pip packages for linux distros. Note that these manywheels are not actually manylinux compliant. - -`builder/manywheel/build_cpu.sh` and `builder/manywheel/build.sh` (for CUDA builds) just set different env vars and then call into `builder/manywheel/build_common.sh` - -The entrypoint file `builder/manywheel/build_common.sh` is really really complicated because - -* This used to handle building for several different python versions at the same time. The loops have been removed, but there's still unnecessary folders and movements here and there. - * The script is never used this way anymore. This extra machinery could be removed. -* This used to handle testing the pip packages too. This is why there's testing code at the end that messes with python installations and stuff - * The script is never used this way anymore. This extra machinery could be removed. -* This also builds libtorch packages - * This should really be separate. libtorch packages are c++ only and have no python. They should not share infra with all the python specific stuff in this file. -* There is a lot of messing with rpaths. This is necessary, but could be made much much simpler if the above issues were fixed. - -## Wheels (MacOS pip and libtorch packages) - -The entrypoint file `builder/wheel/build_wheel.sh` is complicated because - -* The mac builds used to all run on one machine (we didn't have autoscaling mac machines till circleci). So this script handled siloing itself by setting-up and tearing-down its build env and siloing itself into its own build directory. - * The script is never used this way anymore. This extra machinery could be removed. -* This also builds libtorch packages - * Ditto the comment above. This should definitely be separated out. - -Note that the MacOS Python wheels are still built in conda environments. Some of the dependencies present during build also come from conda.
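For orientation, the conda flow described a few sections above boils down to a single `conda build` call over the recipe folder. A minimal sketch follows; the Python version and output folder are illustrative placeholders, and the real entrypoint in `builder/conda/` sets many more variables before getting here:

```sh
# Hypothetical direct invocation of the pytorch-nightly recipe folder.
# conda-build creates a fresh env from meta.yaml, runs build.sh inside it,
# and saves the finished package as a tarball in the output folder.
cd builder/conda
conda build pytorch-nightly/ --python 3.8 --output-folder /final_pkgs
```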
- -## Windows Wheels (Windows pip and libtorch packages) - -The entrypoint file `builder/windows/build_pytorch.bat` is complicated because - -* This used to handle building for several different python versions at the same time. This is why there are loops everywhere - * The script is never used this way anymore. This extra machinery could be removed. -* This used to handle testing the pip packages too. This is why there's testing code at the end that messes with python installations and stuff - * The script is never used this way anymore. This extra machinery could be removed. -* This also builds libtorch packages - * This should really be separate. libtorch packages are c++ only and have no python. They should not share infra with all the python specific stuff in this file. - -Note that the Windows Python wheels are still built in conda environments. Some of the dependencies present during build also come from conda. - -## General notes - -### Note on run_tests.sh, smoke_test.sh, and check_binary.sh - -* These should all be consolidated -* These must run on all OS types: MacOS, Linux, and Windows -* These all run smoke tests at the moment. They inspect the packages some, maybe run a few import statements. They DO NOT run the python tests nor the cpp tests. The idea is that python tests on main and PR merges will catch all breakages. All these tests have to do is make sure the special binary machinery didn't mess anything up. -* There are separate run_tests.sh and smoke_test.sh because one used to be called by the smoke jobs and one used to be called by the binary test jobs (see circleci structure section above). This is still true actually, but these could be united into a single script that runs these checks, given an installed pytorch package. - -### Note on libtorch - -Libtorch packages are built in the wheel build scripts: manywheel/build_*.sh for linux and build_wheel.sh for mac. There are several things wrong with this - -* It's confusing. Most of those scripts deal with python specifics. -* The extra conditionals everywhere severely complicate the wheel build scripts -* The process for building libtorch is different from the official instructions (a plain call to cmake, or a call to a script) - -### Note on docker images / Dockerfiles - -All linux builds occur in docker images. The docker images are - -* pytorch/conda-cuda - * Has ALL CUDA versions installed. The script pytorch/builder/conda/switch_cuda_version.sh sets /usr/local/cuda to a symlink to e.g. /usr/local/cuda-10.0 to enable different CUDA builds - * Also used for cpu builds -* pytorch/manylinux-cuda90 -* pytorch/manylinux-cuda100 - * Also used for cpu builds - -The Dockerfiles are available in pytorch/builder, but there is no circleci job or script to build these docker images, and they cannot be run locally (unless you have the correct local packages/paths). Only Soumith can build them right now. - -### General Python - -* This is still a good explanation of python installations https://caffe2.ai/docs/faq.html#why-do-i-get-import-errors-in-python-when-i-try-to-use-caffe2 - -# How to manually rebuild the binaries - -tl;dr make a PR that looks like https://github.com/pytorch/pytorch/pull/21159 - -Sometimes we want to push a change to main and then rebuild all of today's binaries after that change. As of May 30, 2019 there isn't a way to manually run a workflow in the UI. You can manually re-run a workflow, but it will use the exact same git commits as the first run and will not include any changes.
So we have to make a PR and then force circleci to run the binary workflow instead of the normal tests. The above PR is an example of how to do this; essentially you copy-paste the binarybuilds workflow steps into the default workflow steps. If you need to point the builder repo to a different commit then you'd need to change https://github.com/pytorch/pytorch/blob/main/.circleci/scripts/binary_checkout.sh#L42-L45 to checkout what you want. - -## How to test changes to the binaries via .circleci - -Writing PRs that test the binaries is annoying, since the default circleci jobs that run on PRs are not the jobs that you want to run. Likely, changes to the binaries will touch something under .circleci/ and require that .circleci/config.yml be regenerated (.circleci/config.yml controls all .circleci behavior, and is generated using `.circleci/regenerate.sh` in python 3.7). But you also need to manually hardcode the binary jobs that you want to test into the .circleci/config.yml workflow, so you should actually make at least two commits, one for your changes and one to temporarily hardcode jobs. See https://github.com/pytorch/pytorch/pull/22928 as an example of how to do this. - -```sh -# Make your changes -touch .circleci/verbatim-sources/nightly-binary-build-defaults.yml -# Regenerate the yaml, has to be in python 3.7 -.circleci/regenerate.sh -# Make a commit -git add .circleci * -git commit -m "My real changes" -git push origin my_branch -# Now hardcode the jobs that you want in the .circleci/config.yml workflows section -# Also eliminate ensure-consistency and should_run_job checks -# e.g. https://github.com/pytorch/pytorch/commit/2b3344bfed8772fe86e5210cc4ee915dee42b32d -# Make a commit you won't keep -git add .circleci -git commit -m "[DO NOT LAND] testing binaries for above changes" -git push origin my_branch -# Now you need to make some changes to the first commit. -git rebase -i HEAD~2 # mark the first commit as 'edit' -# Make the changes -touch .circleci/verbatim-sources/nightly-binary-build-defaults.yml -.circleci/regenerate.sh -# Ammend the commit and recontinue -git add .circleci -git commit --amend -git rebase --continue -# Update the PR, need to force since the commits are different now -git push origin my_branch --force -``` - -The advantage of this flow is that you can make new changes to the base commit and regenerate the .circleci without having to re-write which binary jobs you want to test on. The downside is that all updates will be force pushes. - -## How to build a binary locally - -### Linux - -You can build Linux binaries locally easily using docker. - -```sh -# Run the docker -# Use the correct docker image, pytorch/conda-cuda used here as an example -# -# -v path/to/foo:path/to/bar makes path/to/foo on your local machine (the -# machine that you're running the command on) accessible to the docker -# container at path/to/bar. So if you then run `touch path/to/bar/baz` -# in the docker container then you will see path/to/foo/baz on your local -# machine. You could also clone the pytorch and builder repos in the docker. -# -# If you know how, add ccache as a volume too and speed up everything -docker run \ - -v your/pytorch/repo:/pytorch \ - -v your/builder/repo:/builder \ - -v where/you/want/packages/to/appear:/final_pkgs \ - -it pytorch/conda-cuda /bin/bash -# Export whatever variables are important to you. 
All variables that you'd -# possibly need are in .circleci/scripts/binary_populate_env.sh -# You should probably always export at least these 3 variables -export PACKAGE_TYPE=conda -export DESIRED_PYTHON=3.7 -export DESIRED_CUDA=cpu -# Call the entrypoint -# `|& tee foo.log` just copies all stdout and stderr output to foo.log -# The builds generate lots of output so you probably need this when -# building locally. -/builder/conda/build_pytorch.sh |& tee build_output.log -``` - -**Building CUDA binaries on docker** - -You can build CUDA binaries on CPU only machines, but you can only run CUDA binaries on CUDA machines. This means that you can build a CUDA binary on a docker on your laptop if you so choose (though it's gonna take a long time). - -For Facebook employees, ask about beefy machines that have docker support and use those instead of your laptop; it will be 5x as fast. - -### MacOS - -There's no easy way to generate reproducible hermetic MacOS environments. If you have a Mac laptop then you can try emulating the .circleci environments as much as possible, but you probably have packages in /usr/local/, possibly installed by brew, that will probably interfere with the build. If you're trying to repro an error on a Mac build in .circleci and you can't seem to repro locally, then my best advice is actually to iterate on .circleci :/ - -But if you want to try, then I'd recommend - -```sh -# Create a new terminal -# Clear your LD_LIBRARY_PATH and trim as much out of your PATH as you -# know how to do -# Install a new miniconda -# First remove any other python or conda installation from your PATH -# Always install miniconda 3, even if building for Python <3 -new_conda="~/my_new_conda" -conda_sh="$new_conda/install_miniconda.sh" -curl -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-MacOSX-x86_64.sh -chmod +x "$conda_sh" -"$conda_sh" -b -p "$MINICONDA_ROOT" -rm -f "$conda_sh" -export PATH="~/my_new_conda/bin:$PATH" -# Create a clean python env -# All MacOS builds use conda to manage the python env and dependencies -# that are built with, even the pip packages -conda create -yn binary python=2.7 -conda activate binary -# Export whatever variables are important to you. All variables that you'd -# possibly need are in .circleci/scripts/binary_populate_env.sh -# You should probably always export at least these 3 variables -export PACKAGE_TYPE=conda -export DESIRED_PYTHON=3.7 -export DESIRED_CUDA=cpu -# Call the entrypoint you want -path/to/builder/wheel/build_wheel.sh -``` - -N.B. installing a brand new miniconda is important. This has to do with how conda installations work. See the "General Python" section above, but tldr; is that - -1. You make the 'conda' command accessible by prepending `path/to/conda_root/bin` to your PATH. -2. You make a new env and activate it, which then also gets prepended to your PATH. Now you have `path/to/conda_root/envs/new_env/bin:path/to/conda_root/bin:$PATH` -3. Now say you (or some code that you ran) call python executable `foo` - 1. if you installed `foo` in `new_env`, then `path/to/conda_root/envs/new_env/bin/foo` will get called, as expected. - 2. But if you forgot to installed `foo` in `new_env` but happened to previously install it in your root conda env (called 'base'), then unix/linux will still find `path/to/conda_root/bin/foo` . This is dangerous, since `foo` can be a different version than you want; `foo` can even be for an incompatible python version!
- -Newer conda versions and proper python hygiene can prevent this, but just install a new miniconda to be safe. - -### Windows - -TODO: fill in +The PyTorch migration from CircleCI to GitHub Actions has been completed. All continuous integration & deployment workflows are defined in the `.github/workflows` folder. diff --git a/.circleci/scripts/binary_checkout.sh b/.circleci/scripts/binary_checkout.sh deleted file mode 100755 index 7bcf0b7b6431d..0000000000000 --- a/.circleci/scripts/binary_checkout.sh +++ /dev/null @@ -1,69 +0,0 @@ -#!/bin/bash -set -eux -o pipefail - -retry () { - $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) -} - - -# This step runs on multiple executors with different envfile locations -if [[ "$(uname)" == Darwin ]]; then - # macos executor (builds and tests) - workdir="/Users/distiller/project" -elif [[ "$OSTYPE" == "msys" ]]; then - # windows executor (builds and tests) - rm -rf /c/w - ln -s "/c/Users/circleci/project" /c/w - workdir="/c/w" -elif [[ -d "/home/circleci/project" ]]; then - # machine executor (binary tests) - workdir="/home/circleci/project" -else - # docker executor (binary builds) - workdir="/" -fi - -# It is very important that this stays in sync with binary_populate_env.sh -if [[ "$OSTYPE" == "msys" ]]; then - # We need to make the paths as short as possible on Windows - export PYTORCH_ROOT="$workdir/p" - export BUILDER_ROOT="$workdir/b" -else - export PYTORCH_ROOT="$workdir/pytorch" - export BUILDER_ROOT="$workdir/builder" -fi - -# Try to extract PR number from branch if not already set -if [[ -z "${CIRCLE_PR_NUMBER:-}" ]]; then - CIRCLE_PR_NUMBER="$(echo ${CIRCLE_BRANCH} | sed -E -n 's/pull\/([0-9]*).*/\1/p')" -fi - -# Clone the Pytorch branch -retry git clone https://github.com/pytorch/pytorch.git "$PYTORCH_ROOT" -pushd "$PYTORCH_ROOT" -if [[ -n "${CIRCLE_PR_NUMBER:-}" ]]; then - # "smoke" binary build on PRs - git fetch --force origin "pull/${CIRCLE_PR_NUMBER}/head:remotes/origin/pull/${CIRCLE_PR_NUMBER}" - git reset --hard "$CIRCLE_SHA1" - git checkout -q -B "$CIRCLE_BRANCH" - git reset --hard "$CIRCLE_SHA1" -elif [[ -n "${CIRCLE_SHA1:-}" ]]; then - # Scheduled workflows & "smoke" binary build on trunk on PR merges - DEFAULT_BRANCH="$(git remote show $CIRCLE_REPOSITORY_URL | awk '/HEAD branch/ {print $NF}')" - git reset --hard "$CIRCLE_SHA1" - git checkout -q -B $DEFAULT_BRANCH -else - echo "Can't tell what to checkout" - exit 1 -fi -retry git submodule update --init --recursive -echo "Using Pytorch from " -git --no-pager log --max-count 1 -popd - -# Clone the Builder main repo -retry git clone -q https://github.com/pytorch/builder.git "$BUILDER_ROOT" -pushd "$BUILDER_ROOT" -echo "Using builder from " -git --no-pager log --max-count 1 -popd diff --git a/.circleci/scripts/binary_install_miniconda.sh b/.circleci/scripts/binary_install_miniconda.sh deleted file mode 100755 index ce08805bd5b04..0000000000000 --- a/.circleci/scripts/binary_install_miniconda.sh +++ /dev/null @@ -1,44 +0,0 @@ -#!/bin/bash - -set -eux -o pipefail - -# This step runs on multiple executors with different envfile locations -if [[ "$(uname)" == Darwin ]]; then - envfile="/Users/distiller/project/env" -elif [[ -d "/home/circleci/project" ]]; then - # machine executor (binary tests) - envfile="/home/circleci/project/env" -else - # docker executor (binary builds) - envfile="/env" -fi - -# TODO this is super hacky and ugly.
Basically, the binary_update_html job does -# not have an env file, since it does not call binary_populate_env.sh, since it -# does not have a BUILD_ENVIRONMENT. So for this one case, which we detect by a -# lack of an env file, we manually export the environment variables that we -# need to install miniconda -if [[ ! -f "$envfile" ]]; then - MINICONDA_ROOT="/home/circleci/project/miniconda" - workdir="/home/circleci/project" - retry () { - $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*) - } - export -f retry -else - source "$envfile" -fi - -conda_sh="$workdir/install_miniconda.sh" -if [[ "$(uname)" == Darwin ]]; then - curl --retry 3 --retry-all-errors -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-py39_4.12.0-MacOSX-x86_64.sh -else - curl --retry 3 --retry-all-errors -o "$conda_sh" https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -fi -chmod +x "$conda_sh" -"$conda_sh" -b -p "$MINICONDA_ROOT" -rm -f "$conda_sh" - -# We can't actually add miniconda to the PATH in the envfile, because that -# breaks 'unbuffer' in Mac jobs. This is probably because conda comes with -# a tclsh, which then gets inserted before the tclsh needed in /usr/bin diff --git a/.circleci/scripts/binary_macos_build.sh b/.circleci/scripts/binary_macos_build.sh index 8ee131de0435f..3f9e6e8eb5156 100755 --- a/.circleci/scripts/binary_macos_build.sh +++ b/.circleci/scripts/binary_macos_build.sh @@ -4,10 +4,6 @@ set -eux -o pipefail source "${BINARY_ENV_FILE:-/Users/distiller/project/env}" mkdir -p "$PYTORCH_FINAL_PACKAGE_DIR" -if [[ -z "${GITHUB_ACTIONS:-}" ]]; then - export PATH="${workdir:-${HOME}}/miniconda/bin:${PATH}" -fi - # Build export USE_PYTORCH_METAL_EXPORT=1 export USE_COREML_DELEGATE=1 diff --git a/.circleci/scripts/binary_populate_env.sh b/.circleci/scripts/binary_populate_env.sh index 668de45e2c7b1..287423641d777 100755 --- a/.circleci/scripts/binary_populate_env.sh +++ b/.circleci/scripts/binary_populate_env.sh @@ -3,17 +3,9 @@ set -eux -o pipefail export TZ=UTC tagged_version() { - # Grabs version from either the env variable CIRCLE_TAG - # or the pytorch git described version - if [[ "$OSTYPE" == "msys" && -z "${GITHUB_ACTIONS:-}" ]]; then - GIT_DIR="${workdir}/p/.git" - else - GIT_DIR="${workdir}/pytorch/.git" - fi + GIT_DIR="${workdir}/pytorch/.git" GIT_DESCRIBE="git --git-dir ${GIT_DIR} describe --tags --match v[0-9]*.[0-9]*.[0-9]*" - if [[ -n "${CIRCLE_TAG:-}" ]]; then - echo "${CIRCLE_TAG}" - elif [[ ! -d "${GIT_DIR}" ]]; then + if [[ ! -d "${GIT_DIR}" ]]; then echo "Abort, abort! Git dir ${GIT_DIR} does not exists!" 
kill $$ elif ${GIT_DESCRIBE} --exact >/dev/null; then @@ -59,6 +51,7 @@ PIP_UPLOAD_FOLDER='nightly/' # We put this here so that OVERRIDE_PACKAGE_VERSION below can read from it export DATE="$(date -u +%Y%m%d)" BASE_BUILD_VERSION="$(cat ${PYTORCH_ROOT}/version.txt|cut -da -f1).dev${DATE}" + # Change BASE_BUILD_VERSION to git tag when on a git tag # Use 'git -C' to make doubly sure we're in the correct directory for checking # the git tag @@ -78,6 +71,35 @@ fi export PYTORCH_BUILD_NUMBER=1 +# Set triton version as part of PYTORCH_EXTRA_INSTALL_REQUIREMENTS +TRITON_VERSION=$(cat $PYTORCH_ROOT/.ci/docker/triton_version.txt) + +# Here PYTORCH_EXTRA_INSTALL_REQUIREMENTS is already set for the all the wheel builds hence append TRITON_CONSTRAINT +if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then + # Only linux Python < 3.12 are supported wheels for triton + TRITON_CONSTRAINT="platform_system == 'Linux' and platform_machine == 'x86_64' and python_version < '3.12'" + TRITON_REQUIREMENT="triton==${TRITON_VERSION}; ${TRITON_CONSTRAINT}" + if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then + TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton.txt) + TRITON_REQUIREMENT="pytorch-triton==${TRITON_VERSION}+${TRITON_SHORTHASH}; ${TRITON_CONSTRAINT}" + fi + export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}" +fi + +# Set triton via PYTORCH_EXTRA_INSTALL_REQUIREMENTS for triton rocm package +if [[ "$PACKAGE_TYPE" =~ .*wheel.* && -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*rocm.* && $(uname) == "Linux" && "$DESIRED_PYTHON" != "3.12" ]]; then + TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}" + if [[ -n "$PYTORCH_BUILD_VERSION" && "$PYTORCH_BUILD_VERSION" =~ .*dev.* ]]; then + TRITON_SHORTHASH=$(cut -c1-10 $PYTORCH_ROOT/.ci/docker/ci_commit_pins/triton-rocm.txt) + TRITON_REQUIREMENT="pytorch-triton-rocm==${TRITON_VERSION}+${TRITON_SHORTHASH}" + fi + if [[ -z "${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" ]]; then + export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${TRITON_REQUIREMENT}" + else + export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS} | ${TRITON_REQUIREMENT}" + fi +fi + JAVA_HOME= BUILD_JNI=OFF if [[ "$PACKAGE_TYPE" == libtorch ]]; then @@ -123,12 +145,13 @@ if [[ "${OSTYPE}" == "msys" ]]; then else export DESIRED_DEVTOOLSET="${DESIRED_DEVTOOLSET:-}" fi -export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" + export DATE="$DATE" export NIGHTLIES_DATE_PREAMBLE=1.14.0.dev export PYTORCH_BUILD_VERSION="$PYTORCH_BUILD_VERSION" export PYTORCH_BUILD_NUMBER="$PYTORCH_BUILD_NUMBER" export OVERRIDE_PACKAGE_VERSION="$PYTORCH_BUILD_VERSION" +export PYTORCH_EXTRA_INSTALL_REQUIREMENTS="${PYTORCH_EXTRA_INSTALL_REQUIREMENTS:-}" # TODO: We don't need this anymore IIUC export TORCH_PACKAGE_NAME='torch' @@ -161,28 +184,6 @@ if [[ "$(uname)" != Darwin ]]; then EOL fi -if [[ -z "${GITHUB_ACTIONS:-}" ]]; then - cat >>"$envfile" <> "$envfile" echo ' $* || (sleep 1 && $*) || (sleep 2 && $*) || (sleep 4 && $*) || (sleep 8 && $*)' >> "$envfile" echo '}' >> "$envfile" diff --git a/.circleci/scripts/binary_run_in_docker.sh b/.circleci/scripts/binary_run_in_docker.sh deleted file mode 100755 index 4af14becb4264..0000000000000 --- a/.circleci/scripts/binary_run_in_docker.sh +++ /dev/null @@ -1,29 +0,0 @@ -#!/bin/bash - -# This section is used in the binary_test and smoke_test jobs. 
It expects -# 'binary_populate_env' to have populated /home/circleci/project/env and it -# expects another section to populate /home/circleci/project/ci_test_script.sh -# with the code to run in the docker - -# Expect all needed environment variables to be written to this file -source /home/circleci/project/env -echo "Running the following code in Docker" -cat /home/circleci/project/ci_test_script.sh -echo -echo -set -eux -o pipefail - -# Expect actual code to be written to this file -chmod +x /home/circleci/project/ci_test_script.sh - -VOLUME_MOUNTS="-v /home/circleci/project/:/circleci_stuff -v /home/circleci/project/final_pkgs:/final_pkgs -v ${PYTORCH_ROOT}:/pytorch -v ${BUILDER_ROOT}:/builder" -# Run the docker -if [ -n "${USE_CUDA_DOCKER_RUNTIME:-}" ]; then - export id=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined --gpus all ${VOLUME_MOUNTS} -t -d "${DOCKER_IMAGE}") -else - export id=$(docker run --cap-add=SYS_PTRACE --security-opt seccomp=unconfined ${VOLUME_MOUNTS} -t -d "${DOCKER_IMAGE}") -fi - -# Execute the test script that was populated by an earlier section -export COMMAND='((echo "source /circleci_stuff/env && /circleci_stuff/ci_test_script.sh") | docker exec -i "$id" bash) 2>&1' -echo ${COMMAND} > ./command.sh && unbuffer bash ./command.sh | ts diff --git a/.circleci/scripts/setup_ci_environment.sh b/.circleci/scripts/setup_ci_environment.sh deleted file mode 100755 index 42a605cd44451..0000000000000 --- a/.circleci/scripts/setup_ci_environment.sh +++ /dev/null @@ -1,111 +0,0 @@ -#!/usr/bin/env bash -set -ex -o pipefail - -# Remove unnecessary sources -sudo rm -f /etc/apt/sources.list.d/google-chrome.list -sudo rm -f /etc/apt/heroku.list -sudo rm -f /etc/apt/openjdk-r-ubuntu-ppa-xenial.list -sudo rm -f /etc/apt/partner.list - -# To increase the network reliability, let apt decide which mirror is best to use -sudo sed -i -e 's/http:\/\/.*archive/mirror:\/\/mirrors/' -e 's/\/ubuntu\//\/mirrors.txt/' /etc/apt/sources.list - -retry () { - $* || $* || $* || $* || $* -} - -# Method adapted from here: https://askubuntu.com/questions/875213/apt-get-to-retry-downloading -# (with use of tee to avoid permissions problems) -# This is better than retrying the whole apt-get command -echo "APT::Acquire::Retries \"3\";" | sudo tee /etc/apt/apt.conf.d/80-retries - -retry sudo apt-get update -qq -retry sudo apt-get -y install \ - moreutils \ - expect-dev - -echo "== DOCKER VERSION ==" -docker version - -if ! command -v aws >/dev/null; then - retry sudo pip3 -q install awscli==1.19.64 -fi - -if [ -n "${USE_CUDA_DOCKER_RUNTIME:-}" ]; then - DRIVER_FN="NVIDIA-Linux-x86_64-515.76.run" - wget "https://s3.amazonaws.com/ossci-linux/nvidia_driver/$DRIVER_FN" - sudo /bin/bash "$DRIVER_FN" -s --no-drm || (sudo cat /var/log/nvidia-installer.log && false) - nvidia-smi - - # Taken directly from https://github.com/NVIDIA/nvidia-docker - # Add the package repositories - distribution=$(. 
/etc/os-release;echo "$ID$VERSION_ID") - curl -s -L --retry 3 --retry-all-errors https://nvidia.github.io/nvidia-docker/gpgkey | sudo apt-key add - - curl -s -L --retry 3 --retry-all-errors "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.list" | sudo tee /etc/apt/sources.list.d/nvidia-docker.list - - retry sudo apt-get update -qq - # Necessary to get the `--gpus` flag to function within docker - retry sudo apt-get install -y nvidia-container-toolkit - sudo systemctl restart docker -else - # Explicitly remove nvidia docker apt repositories if not building for cuda - sudo rm -rf /etc/apt/sources.list.d/nvidia-docker.list -fi - -add_to_env_file() { - local name=$1 - local value=$2 - case "$value" in - *\ *) - # BASH_ENV should be set by CircleCI - echo "${name}='${value}'" >> "${BASH_ENV:-/tmp/env}" - ;; - *) - echo "${name}=${value}" >> "${BASH_ENV:-/tmp/env}" - ;; - esac -} - -add_to_env_file CI_MASTER "${CI_MASTER:-}" -add_to_env_file COMMIT_SOURCE "${CIRCLE_BRANCH:-}" -add_to_env_file BUILD_ENVIRONMENT "${BUILD_ENVIRONMENT}" -add_to_env_file CIRCLE_PULL_REQUEST "${CIRCLE_PULL_REQUEST}" - - -if [[ "${BUILD_ENVIRONMENT}" == *-build ]]; then - add_to_env_file SCCACHE_BUCKET ossci-compiler-cache-circleci-v2 - - SCCACHE_MAX_JOBS=$(( $(nproc) - 1 )) - MEMORY_LIMIT_MAX_JOBS=8 # the "large" resource class on CircleCI has 32 CPU cores, if we use all of them we'll OOM - MAX_JOBS=$(( ${SCCACHE_MAX_JOBS} > ${MEMORY_LIMIT_MAX_JOBS} ? ${MEMORY_LIMIT_MAX_JOBS} : ${SCCACHE_MAX_JOBS} )) - add_to_env_file MAX_JOBS "${MAX_JOBS}" - - if [ -n "${USE_CUDA_DOCKER_RUNTIME:-}" ]; then - add_to_env_file TORCH_CUDA_ARCH_LIST 5.2 - fi - - if [[ "${BUILD_ENVIRONMENT}" == *xla* ]]; then - # This IAM user allows write access to S3 bucket for sccache & bazels3cache - set +x - add_to_env_file XLA_CLANG_CACHE_S3_BUCKET_NAME "${XLA_CLANG_CACHE_S3_BUCKET_NAME:-}" - add_to_env_file AWS_ACCESS_KEY_ID "${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}" - add_to_env_file AWS_SECRET_ACCESS_KEY "${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_AND_XLA_BAZEL_S3_BUCKET_V2:-}" - set -x - else - # This IAM user allows write access to S3 bucket for sccache - set +x - add_to_env_file XLA_CLANG_CACHE_S3_BUCKET_NAME "${XLA_CLANG_CACHE_S3_BUCKET_NAME:-}" - add_to_env_file AWS_ACCESS_KEY_ID "${CIRCLECI_AWS_ACCESS_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}" - add_to_env_file AWS_SECRET_ACCESS_KEY "${CIRCLECI_AWS_SECRET_KEY_FOR_SCCACHE_S3_BUCKET_V4:-}" - set -x - fi -fi - -# This IAM user only allows read-write access to ECR -set +x -export AWS_ACCESS_KEY_ID=${CIRCLECI_AWS_ACCESS_KEY_FOR_ECR_READ_WRITE_V4:-} -export AWS_SECRET_ACCESS_KEY=${CIRCLECI_AWS_SECRET_KEY_FOR_ECR_READ_WRITE_V4:-} -export AWS_ACCOUNT_ID=$(aws sts get-caller-identity|grep Account|cut -f4 -d\") -export AWS_REGION=us-east-1 -aws ecr get-login-password --region $AWS_REGION|docker login --username AWS --password-stdin $AWS_ACCOUNT_ID.dkr.ecr.$AWS_REGION.amazonaws.com -set -x diff --git a/.circleci/scripts/setup_linux_system_environment.sh b/.circleci/scripts/setup_linux_system_environment.sh deleted file mode 100755 index 780f7c1bd3790..0000000000000 --- a/.circleci/scripts/setup_linux_system_environment.sh +++ /dev/null @@ -1,50 +0,0 @@ -#!/usr/bin/env bash -set -eux -o pipefail - -# Set up CircleCI GPG keys for apt, if needed -curl --retry 3 --retry-all-errors -s -L https://packagecloud.io/circleci/trusty/gpgkey | sudo apt-key add - - -# Stop background apt updates. 
Hypothetically, the kill should not -# be necessary, because stop is supposed to send a kill signal to -# the process, but we've added it for good luck. Also -# hypothetically, it's supposed to be unnecessary to wait for -# the process to block. We also have that line for good luck. -# If you like, try deleting them and seeing if it works. -sudo systemctl stop apt-daily.service || true -sudo systemctl kill --kill-who=all apt-daily.service || true - -sudo systemctl stop unattended-upgrades.service || true -sudo systemctl kill --kill-who=all unattended-upgrades.service || true - -# wait until `apt-get update` has been killed -while systemctl is-active --quiet apt-daily.service -do - sleep 1; -done -while systemctl is-active --quiet unattended-upgrades.service -do - sleep 1; -done - -# See if we actually were successful -systemctl list-units --all | cat - -# For good luck, try even harder to kill apt-get -sudo pkill apt-get || true - -# For even better luck, purge unattended-upgrades -sudo apt-get purge -y unattended-upgrades || true - -cat /etc/apt/sources.list - -# For the bestest luck, kill again now -sudo pkill apt || true -sudo pkill dpkg || true - -# Try to detect if apt/dpkg is stuck -if ps auxfww | grep '[a]pt'; then - echo "WARNING: There are leftover apt processes; subsequent apt update will likely fail" -fi -if ps auxfww | grep '[d]pkg'; then - echo "WARNING: There are leftover dpkg processes; subsequent apt update will likely fail" -fi diff --git a/.clang-tidy b/.clang-tidy index 7e2313b94c061..d0d74d154c6cf 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -36,13 +36,13 @@ hicpp-exception-baseclass, hicpp-avoid-goto, misc-*, -misc-const-correctness, +-misc-include-cleaner, -misc-use-anonymous-namespace, -misc-unused-parameters, -misc-no-recursion, -misc-non-private-member-variables-in-classes, -misc-confusable-identifiers, modernize-*, --modernize-concat-nested-namespaces, -modernize-macro-to-enum, -modernize-return-braced-init-list, -modernize-use-auto, diff --git a/.devcontainer/Dockerfile b/.devcontainer/Dockerfile index 97f56d61cba89..e151576219af2 100644 --- a/.devcontainer/Dockerfile +++ b/.devcontainer/Dockerfile @@ -30,5 +30,5 @@ RUN if [ -n "$CLANG_VERSION" ]; then \ # Install cuda if version is specified ARG CUDA_VERSION RUN if [ -n "$CUDA_VERSION" ]; then \ - conda install cuda -c "nvidia/label/cuda-${CUDA_VERSION}"; \ + conda install -y cuda -c "nvidia/label/cuda-${CUDA_VERSION}"; \ fi diff --git a/.devcontainer/README.md b/.devcontainer/README.md index ef4067a326012..17e4e4958ca85 100644 --- a/.devcontainer/README.md +++ b/.devcontainer/README.md @@ -46,7 +46,7 @@ If you are using [Visual Studio Code Remote - SSH](https://code.visualstudio.com ## Step 6: Open in DevContainer -1. In VSCode, use the Command Palette (`Ctrl+Shift+P` or `Cmd+Shift+P` on macOS) to run the "Remote-Containers: Open Folder in Container..." command. +1. In VSCode, use the Command Palette (`Ctrl+Shift+P` or `Cmd+Shift+P` on macOS) to run the "Dev Containers: Open Folder in Container..." command. 2. You will be prompted with two options: CPU dev container or CUDA dev container. Choose the one you want to run. 
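If you choose the CUDA dev container, a quick sanity check once it is up and running; this assumes the host exposes a GPU to the container and that PyTorch has already been built or installed inside it:

```sh
# Confirm the GPU is visible to the container and to PyTorch.
nvidia-smi
python -c "import torch; print(torch.version.cuda, torch.cuda.is_available())"
```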
## Step 7: Wait for Building the Environment diff --git a/.flake8 b/.flake8 index c59af78be7bc4..e3a90f36aaf9a 100644 --- a/.flake8 +++ b/.flake8 @@ -2,7 +2,7 @@ # NOTE: **Mirror any changes** to this file the [tool.ruff] config in pyproject.toml # before we can fully move to use ruff enable-extensions = G -select = B,C,E,F,G,P,SIM1,T4,W,B9,TOR0,TOR1,TOR2 +select = B,C,E,F,G,P,SIM1,T4,W,B9,TOR0,TOR1,TOR2,TOR9 max-line-length = 120 # C408 ignored because we like the dict keyword argument syntax # E501 is not flexible enough, we're using B950 instead @@ -27,6 +27,9 @@ ignore = # TODO(kit1980): fix all TOR102 issues # `torch.load` without `weights_only` parameter is unsafe TOR102, + # TODO(kit1980): resolve all TOR003 issues + # pass `use_reentrant` explicitly to `checkpoint`. + TOR003 per-file-ignores = __init__.py: F401 test/**: F821 @@ -34,6 +37,24 @@ per-file-ignores = torch/utils/cpp_extension.py: B950 torchgen/api/types/__init__.py: F401,F403 torchgen/executorch/api/types/__init__.py: F401,F403 + test/dynamo/test_higher_order_ops.py: B950 + torch/testing/_internal/dynamo_test_failures.py: B950 + # TOR901 is only for test, we want to ignore it for everything else. + # It's not easy to configure this without affecting other per-file-ignores, + # so we explicitly list every file where it's violated outside of test. + torch/__init__.py: F401,TOR901 + torch/_custom_op/impl.py: TOR901 + torch/_export/serde/upgrade.py: TOR901 + torch/_functorch/vmap.py: TOR901 + torch/_inductor/test_operators.py: TOR901 + torch/_library/abstract_impl.py: TOR901 + torch/_meta_registrations.py: TOR901 + torch/_prims/__init__.py: F401,TOR901 + torch/_prims/rng_prims.py: TOR901 + torch/ao/quantization/fx/_decomposed.py: TOR901 + torch/distributed/_functional_collectives.py: TOR901 + torch/distributed/_spmd/data_parallel.py: TOR901 + torch/distributed/_tensor/_collective_utils.py: TOR901 optional-ascii-coding = True exclude = ./.git, diff --git a/.gitattributes b/.gitattributes index 8bccf04bbb7dc..e904301752950 100644 --- a/.gitattributes +++ b/.gitattributes @@ -4,3 +4,4 @@ .github/generated-* linguist-generated=true .github/scripts/gql_mocks.json linguist-generated=true third_party/LICENSES_BUNDLED.txt linguist-generated=true +tools/build/bazel/requirements.txt linguist-generated=true diff --git a/.github/ISSUE_TEMPLATE/pt2-bug-report.yml b/.github/ISSUE_TEMPLATE/pt2-bug-report.yml index c4dc9aa772fee..7ba631fb05cc6 100644 --- a/.github/ISSUE_TEMPLATE/pt2-bug-report.yml +++ b/.github/ISSUE_TEMPLATE/pt2-bug-report.yml @@ -8,7 +8,18 @@ body: value: > #### Before submitting a bug, please make sure the issue hasn't been already addressed by searching through [the existing and past issues](https://github.com/pytorch/pytorch/issues) - It's likely that your bug will be resolved by checking our FAQ or troubleshooting guide [documentation](https://pytorch.org/docs/master/dynamo/index.html) + It's likely that your bug will be resolved by checking our FAQ or troubleshooting guide [documentation](https://pytorch.org/docs/main/dynamo/index.html) + + Note: if you're submitting an issue that you generated from a fuzzer, please do the following: + + - Ensure rtol/atol are at default tolerances + + - Don't compare indices of max/min etc., because that avoids the above requirement + + - If comparing eager and torch.compile at fp16/bf16, you should use fp32 as baseline + + If the above requirements are met, add the label "topic: fuzzer" to your issue.
+ - type: textarea attributes: label: 🐛 Describe the bug @@ -33,7 +44,7 @@ body: label: Minified repro description: | Please run the minifier on your example and paste the minified code below - Learn more here https://pytorch.org/docs/master/compile/troubleshooting.html + Learn more here https://pytorch.org/docs/main/torch.compiler_troubleshooting.html placeholder: | env TORCHDYNAMO_REPRO_AFTER="aot" python your_model.py or diff --git a/.github/actionlint.yaml b/.github/actionlint.yaml index 58cdbd2659768..05fc1243251b1 100644 --- a/.github/actionlint.yaml +++ b/.github/actionlint.yaml @@ -19,8 +19,9 @@ self-hosted-runner: - windows.g5.4xlarge.nvidia.gpu - bm-runner - linux.rocm.gpu - - macos-m1-12 + - macos-m1-stable - macos-m1-13 + - macos-m1-14 - macos-12-xl - macos-12 - macos12.3-m1 diff --git a/.github/actions/download-build-artifacts/action.yml b/.github/actions/download-build-artifacts/action.yml index a7107f2067def..2deeda72802dd 100644 --- a/.github/actions/download-build-artifacts/action.yml +++ b/.github/actions/download-build-artifacts/action.yml @@ -9,6 +9,10 @@ inputs: use-gha: description: If set to any value, use GHA to download the artifact. Otherwise use s3. required: false + s3-bucket: + description: S3 bucket to download builds + required: false + default: "gha-artifacts" runs: using: composite @@ -18,9 +22,10 @@ runs: uses: seemethere/download-artifact-s3@v4 with: name: ${{ inputs.name }} + s3-bucket: ${{ inputs.s3-bucket }} - name: Download PyTorch Build Artifacts from GHA - if: inputs.use-gha + if: ${{ inputs.use-gha }} uses: actions/download-artifact@v3 with: name: ${{ inputs.name }} @@ -29,6 +34,10 @@ runs: shell: bash run: unzip -o artifacts.zip + - name: Remove artifacts.zip + shell: bash + run: rm artifacts.zip + - name: Output disk space left shell: bash run: df -H diff --git a/.github/actions/download-td-artifacts/action.yml b/.github/actions/download-td-artifacts/action.yml new file mode 100644 index 0000000000000..595093abaead0 --- /dev/null +++ b/.github/actions/download-td-artifacts/action.yml @@ -0,0 +1,29 @@ +name: Download TD Artifacts + +description: Download artifacts from target_determination.yml + +inputs: + use-gha: + description: If set to any value, use GHA to download the artifact. Otherwise use s3. + required: false + +runs: + using: composite + steps: + - name: Download TD Artifacts from S3 + if: ${{ !inputs.use-gha }} + uses: seemethere/download-artifact-s3@v4 + with: + name: td_results + + - name: Download TD Artifacts from GHA + if: inputs.use-gha + uses: actions/download-artifact@v3 + with: + name: td_results.json + + - name: Move artifacts to .additional_ci_files folder + shell: bash + run: | + mkdir -p .additional_ci_files + mv td_results.json .additional_ci_files/td_results.json diff --git a/.github/actions/filter-test-configs/action.yml b/.github/actions/filter-test-configs/action.yml index 3fb107a17b95e..e1f2067d58076 100644 --- a/.github/actions/filter-test-configs/action.yml +++ b/.github/actions/filter-test-configs/action.yml @@ -13,6 +13,13 @@ inputs: required: true type: string description: JSON description of what test configs to run. + selected-test-configs: + required: false + type: string + description: | + A comma-separated list of test configurations from the test matrix to keep. + The empty list means we are going to keep every configuration by default + default: "" job-name: type: string required: false @@ -26,11 +33,23 @@ outputs: description: True if the filtered test configs matrix is empty. False otherwise.
value: ${{ steps.filter.outputs.is-test-matrix-empty }} keep-going: - description: True if keep-going label was on PR. + description: True if keep-going label was on PR or [keep-going] in PR body. value: ${{ steps.filter.outputs.keep-going }} reenabled-issues: description: Comma separated list of issue numbers that should correspond to disable test issues that the PR fixes value: ${{ steps.filter.outputs.reenabled-issues }} + ci-verbose-test-logs: + description: True if ci-verbose-test-logs label was on PR or [ci-verbose-test-logs] in PR body. + value: ${{ steps.filter.outputs.ci-verbose-test-logs }} + ci-no-test-timeout: + description: True if ci-no-test-timeout label was on PR or [ci-no-test-timeout] in PR body. + value: ${{ steps.filter.outputs.ci-no-test-timeout }} + ci-no-td: + description: True if ci-no-td label was on PR or [ci-no-td] in PR body. + value: ${{ steps.filter.outputs.ci-no-td }} + ci-td-distributed: + description: True if ci-td-distributed label was on PR or [ci-td-distributed] in PR body. + value: ${{ steps.filter.outputs.ci-td-distributed }} runs: using: composite @@ -114,6 +133,7 @@ runs: --workflow "${GITHUB_WORKFLOW}" \ --job-name "${JOB_NAME}" \ --test-matrix "${{ inputs.test-matrix }}" \ + --selected-test-configs "${{ inputs.selected-test-configs }}" \ --pr-number "${PR_NUMBER}" \ --tag "${TAG}" \ --event-name "${EVENT_NAME}" \ diff --git a/.github/actions/linux-build/action.yml b/.github/actions/linux-build/action.yml new file mode 100644 index 0000000000000..c0f74160507bb --- /dev/null +++ b/.github/actions/linux-build/action.yml @@ -0,0 +1,207 @@ +name: linux-build + +inputs: + build-environment: + required: true + description: Top-level label for what's being built/tested. + docker-image-name: + required: true + description: Name of the base docker image to build with. + build-generates-artifacts: + required: false + default: "true" + description: If set, upload generated build artifacts. + build-with-debug: + required: false + default: "false" + description: If set, build in debug mode. + sync-tag: + required: false + default: "" + description: | + If this is set, our linter will use this to make sure that every other + job with the same `sync-tag` is identical. + cuda-arch-list: + required: false + default: "5.2" + description: | + List of CUDA architectures CI build should target. + runner: + required: false + default: "linux.2xlarge" + description: Runner label to select worker type + test-matrix: + required: false + type: string + description: | + An optional JSON description of what test configs to run later on. This + is moved here from the Linux test workflow so that we can apply filter + logic using test-config labels earlier and skip unnecessary builds + s3-bucket: + description: S3 bucket to download artifact + required: false + default: "gha-artifacts" + aws-role-to-assume: + description: role to assume for downloading artifacts + required: false + default: "" + GITHUB_TOKEN: + description: GitHub token + required: true + HUGGING_FACE_HUB_TOKEN: + description: Hugging Face Hub token + required: false + default: "" +outputs: + docker-image: + value: ${{ steps.calculate-docker-image.outputs.docker-image }} + description: The docker image containing the built PyTorch. + test-matrix: + value: ${{ steps.filter.outputs.test-matrix }} + description: An optional JSON description of what test configs to run later on.
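The `selected-test-configs` input added to `filter-test-configs` above and the filtered `test-matrix` output of `linux-build` both operate on the same JSON test matrix. The sketch below illustrates the kind of comma-separated filtering involved, assuming a standard GitHub Actions `{"include": [...]}` matrix shape; it is not the actual logic in `.github/scripts/filter_test_configs.py`.

```python
# Rough sketch of selected-test-configs style filtering over a test matrix.
# Assumes an {"include": [...]} layout; not the real filter_test_configs.py.
import json

def filter_matrix(test_matrix: str, selected_test_configs: str) -> str:
    matrix = json.loads(test_matrix)
    keep = {c.strip() for c in selected_test_configs.split(",") if c.strip()}
    if keep:  # an empty selection keeps every configuration
        matrix["include"] = [
            entry for entry in matrix.get("include", [])
            if entry.get("config") in keep
        ]
    return json.dumps(matrix)

print(filter_matrix(
    '{"include": [{"config": "default", "shard": 1}, {"config": "distributed", "shard": 1}]}',
    "default",
))
```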
+ +runs: + using: composite + steps: + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: configure aws credentials + uses: aws-actions/configure-aws-credentials@v3 + if: ${{ inputs.aws-role-to-assume != '' }} + with: + role-to-assume: ${{ inputs.aws-role-to-assume }} + role-session-name: gha-linux-build + role-duration-seconds: 10800 + aws-region: us-east-1 + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: ${{ inputs.docker-image-name }} + + - name: Use following to pull public copy of the image + id: print-ghcr-mirror + env: + ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + shell: bash + run: | + tag=${ECR_DOCKER_IMAGE##*/} + echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Parse ref + id: parse-ref + shell: bash + run: .github/scripts/parse_ref.py + + - name: Get workflow job id + id: get-job-id + uses: ./.github/actions/get-workflow-job-id + if: always() + with: + github-token: ${{ inputs.GITHUB_TOKEN }} + + # Apply the filter logic to the build step too if the test-config label is already there + - name: Select all requested test configurations (if the test matrix is available) + id: filter + uses: ./.github/actions/filter-test-configs + with: + github-token: ${{ inputs.GITHUB_TOKEN }} + test-matrix: ${{ inputs.test-matrix }} + job-name: ${{ steps.get-job-id.outputs.job-name }} + + - name: Download pytest cache + uses: ./.github/actions/pytest-cache-download + continue-on-error: true + with: + cache_dir: .pytest_cache + job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} + s3_bucket: ${{ inputs.s3-bucket }} + + - name: Build + if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == '' + id: build + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + # TODO duplicated + AWS_DEFAULT_REGION: us-east-1 + PR_NUMBER: ${{ github.event.pull_request.number }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla + PR_LABELS: ${{ toJson(github.event.pull_request.labels.*.name) }} + TORCH_CUDA_ARCH_LIST: ${{ inputs.cuda-arch-list }} + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} + DEBUG: ${{ inputs.build-with-debug == 'true' && '1' || '0' }} + OUR_GITHUB_JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }} + shell: bash + run: | + # detached container should get cleaned up by teardown_ec2_linux + container_name=$(docker run \ + -e BUILD_ENVIRONMENT \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e AWS_DEFAULT_REGION \ + -e PR_NUMBER \ + -e SHA1 \ + -e BRANCH \ + -e SCCACHE_BUCKET \ + -e SCCACHE_S3_KEY_PREFIX \ + -e XLA_CUDA \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + -e TORCH_CUDA_ARCH_LIST \ + -e PR_LABELS \ + -e OUR_GITHUB_JOB_ID \ + -e HUGGING_FACE_HUB_TOKEN \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ 
+ --detach \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + docker exec -t "${container_name}" sh -c '.ci/pytorch/build.sh' + + - name: Archive artifacts into zip + if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped' + shell: bash + run: | + zip -1 -r artifacts.zip dist/ build/custom_test_artifacts build/lib build/bin .additional_ci_files + + - name: Store PyTorch Build Artifacts on S3 + uses: seemethere/upload-artifact-s3@v5 + if: inputs.build-generates-artifacts == 'true' && steps.build.outcome != 'skipped' + with: + name: ${{ inputs.build-environment }} + retention-days: 14 + if-no-files-found: error + path: artifacts.zip + s3-bucket: ${{ inputs.s3-bucket }} + + - name: Upload sccache stats + if: steps.build.outcome != 'skipped' + uses: seemethere/upload-artifact-s3@v5 + with: + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact + retention-days: 365 + if-no-files-found: warn + path: sccache-stats-*.json + s3-bucket: ${{ inputs.s3-bucket }} + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() diff --git a/.github/actions/linux-test/action.yml b/.github/actions/linux-test/action.yml new file mode 100644 index 0000000000000..6c8e761444b0a --- /dev/null +++ b/.github/actions/linux-test/action.yml @@ -0,0 +1,384 @@ +name: linux-test + +inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + test-matrix: + required: true + type: string + description: JSON description of what test configs to run. + docker-image: + required: true + type: string + description: Docker image to run in. + sync-tag: + required: false + type: string + default: "" + description: | + If this is set, our linter will use this to make sure that every other + job with the same `sync-tag` is identical. + use-gha: + required: false + type: string + default: "" + description: If set to any value, upload to GHA. Otherwise upload to S3. 
+ dashboard-tag: + required: false + type: string + default: "" + s3-bucket: + description: S3 bucket to download artifact + required: false + type: string + default: "gha-artifacts" + aws-role-to-assume: + description: role to assume for downloading artifacts + required: false + type: string + default: "" + HUGGING_FACE_HUB_TOKEN: + description: | + HF Auth token to avoid rate limits when downloading models or datasets from hub + required: false + default: "" + GITHUB_TOKEN: + description: GitHub token + required: true + +#env: +# GIT_DEFAULT_BRANCH: ${{ inputs.default_branch }} + +runs: + using: composite + steps: + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: configure aws credentials + if : ${{ inputs.aws-role-to-assume != '' }} + uses: aws-actions/configure-aws-credentials@v3 + with: + role-to-assume: ${{ inputs.aws-role-to-assume }} + role-session-name: gha-linux-test + aws-region: us-east-1 + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: ${{ inputs.docker-image }} + + - name: Use following to pull public copy of the image + id: print-ghcr-mirror + env: + ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + shell: bash + run: | + tag=${ECR_DOCKER_IMAGE##*/} + echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Check if in a ARC runner + shell: bash + id: check_arc_runner + run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT" + + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + id: install-nvidia-driver + uses: pytorch/test-infra/.github/actions/setup-nvidia@main + if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }} + + - name: Lock NVIDIA A100 40GB Frequency + shell: bash + run: | + sudo nvidia-smi -pm 1 + sudo nvidia-smi -ac 1215,1410 + nvidia-smi + if: contains(matrix.runner, 'a100') + + - name: Start monitoring script + id: monitor-script + shell: bash + continue-on-error: true + run: | + python3 -m pip install psutil==5.9.1 nvidia-ml-py==11.525.84 + python3 -m tools.stats.monitor > usage_log.txt 2>&1 & + echo "monitor-script-pid=${!}" >> "${GITHUB_OUTPUT}" + + - name: Download build artifacts + uses: ./.github/actions/download-build-artifacts + with: + name: ${{ inputs.build-environment }} + s3-bucket: ${{ inputs.s3-bucket }} + + - name: Download TD artifacts + continue-on-error: true + uses: ./.github/actions/download-td-artifacts + + - name: Parse ref + id: parse-ref + shell: bash + run: .github/scripts/parse_ref.py + + - name: Get workflow job id + id: get-job-id + uses: ./.github/actions/get-workflow-job-id + if: always() + with: + github-token: ${{ inputs.GITHUB_TOKEN }} + + - name: Check for keep-going label and re-enabled test issues + # This uses the filter-test-configs action because it conviniently + # checks for labels and re-enabled test issues. It does not actually do + # any filtering. All filtering is done in the build step. 
+ id: keep-going + uses: ./.github/actions/filter-test-configs + with: + github-token: ${{ inputs.GITHUB_TOKEN }} + test-matrix: ${{ inputs.test-matrix }} + job-name: ${{ steps.get-job-id.outputs.job-name }} + + - name: Test + id: test + env: + BUILD_ENVIRONMENT: ${{ inputs.build-environment }} + PR_NUMBER: ${{ github.event.pull_request.number }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_WORKFLOW: ${{ github.workflow }} + GITHUB_JOB: ${{ github.job }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_RUN_NUMBER: ${{ github.run_number }} + GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }} + JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} + BRANCH: ${{ steps.parse-ref.outputs.branch }} + SHA1: ${{ github.event.pull_request.head.sha || github.sha }} + BASE_SHA: ${{ github.event.pull_request.base.sha || github.sha }} + TEST_CONFIG: ${{ matrix.config }} + SHARD_NUMBER: ${{ matrix.shard }} + NUM_TEST_SHARDS: ${{ matrix.num_shards }} + REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} + CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} + VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} + NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }} + NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} + TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }} + SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 + SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} + SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }} + DOCKER_IMAGE: ${{ inputs.docker-image }} + XLA_CUDA: ${{ contains(inputs.build-environment, 'xla') && '0' || '' }} + XLA_CLANG_CACHE_S3_BUCKET_NAME: ossci-compiler-clang-cache-circleci-xla + PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} + PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} + DASHBOARD_TAG: ${{ inputs.dashboard-tag }} + HUGGING_FACE_HUB_TOKEN: ${{ inputs.HUGGING_FACE_HUB_TOKEN }} + shell: bash + run: | + set -x + + if [[ $TEST_CONFIG == 'multigpu' ]]; then + TEST_COMMAND=.ci/pytorch/multigpu-test.sh + elif [[ $BUILD_ENVIRONMENT == *onnx* ]]; then + TEST_COMMAND=.ci/onnx/test.sh + else + TEST_COMMAND=.ci/pytorch/test.sh + fi + + # detached container should get cleaned up by teardown_ec2_linux + # TODO: Stop building test binaries as part of the build phase + # Used for GPU_FLAG since that doesn't play nice + # shellcheck disable=SC2086,SC2090 + container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e BUILD_ENVIRONMENT \ + -e PR_NUMBER \ + -e GITHUB_ACTIONS \ + -e GITHUB_REPOSITORY \ + -e GITHUB_WORKFLOW \ + -e GITHUB_JOB \ + -e GITHUB_RUN_ID \ + -e GITHUB_RUN_NUMBER \ + -e GITHUB_RUN_ATTEMPT \ + -e JOB_ID \ + -e JOB_NAME \ + -e BASE_SHA \ + -e BRANCH \ + -e SHA1 \ + -e AWS_DEFAULT_REGION \ + -e IN_WHEEL_TEST \ + -e SHARD_NUMBER \ + -e TEST_CONFIG \ + -e NUM_TEST_SHARDS \ + -e REENABLED_ISSUES \ + -e CONTINUE_THROUGH_ERROR \ + -e VERBOSE_TEST_LOGS \ + -e NO_TEST_TIMEOUT \ + -e NO_TD \ + -e TD_DISTRIBUTED \ + -e PR_LABELS \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e SCCACHE_BUCKET \ + -e SCCACHE_S3_KEY_PREFIX \ + -e XLA_CUDA \ + -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ + -e PYTORCH_TEST_CUDA_MEM_LEAK_CHECK \ + -e PYTORCH_TEST_RERUN_DISABLED_TESTS \ + -e SKIP_SCCACHE_INITIALIZATION=1 \ + -e HUGGING_FACE_HUB_TOKEN \ + -e DASHBOARD_TAG \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --ipc=host \ + 
--shm-size="${SHM_SIZE}" \ + --tty \ + --detach \ + --name="${container_name}" \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + # Propagate download.pytorch.org IP to container + grep download.pytorch.org /etc/hosts | docker exec -i "${container_name}" sudo bash -c "/bin/cat >> /etc/hosts" + echo "DOCKER_CONTAINER_ID=${container_name}" >> "${GITHUB_ENV}" + docker exec -t "${container_name}" sh -c "pip install $(echo dist/*.whl)[opt-einsum] && ${TEST_COMMAND}" + + - name: Upload pytest cache if tests failed + uses: ./.github/actions/pytest-cache-upload + continue-on-error: true + if: failure() && steps.test.conclusion && steps.test.conclusion == 'failure' + with: + cache_dir: .pytest_cache + shard: ${{ matrix.shard }} + sha: ${{ github.event.pull_request.head.sha || github.sha }} + test_config: ${{ matrix.config }} + job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} + + - name: Print remaining test logs + shell: bash + if: always() && steps.test.conclusion + run: | + cat test/**/*_toprint.log || true + + - name: Stop monitoring script + if: always() && steps.monitor-script.outputs.monitor-script-pid + shell: bash + continue-on-error: true + env: + MONITOR_SCRIPT_PID: ${{ steps.monitor-script.outputs.monitor-script-pid }} + run: | + kill "$MONITOR_SCRIPT_PID" + + - name: Upload test artifacts + uses: ./.github/actions/upload-test-artifacts + if: always() && steps.test.conclusion && steps.test.conclusion != 'skipped' + with: + file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} + use-gha: ${{ inputs.use-gha }} + s3-bucket: ${{ inputs.s3-bucket }} + + - name: Collect backtraces from coredumps (if any) + if: always() + shell: bash + run: | + # shellcheck disable=SC2156 + find . -iname "core.[1-9]*" -exec docker exec "${DOCKER_CONTAINER_ID}" sh -c "gdb python {} -ex 'bt' -ex 'q'" \; + + - name: Store Core dumps on S3 + uses: seemethere/upload-artifact-s3@v5 + if: failure() + with: + name: coredumps-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }} + retention-days: 14 + if-no-files-found: ignore + path: ./**/core.[1-9]* + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() + + # NB: We are currently having an intermittent GPU-related issue on G5 runners with + # A10G GPU. Once this happens, trying to reset the GPU as done in setup-nvidia does + # not seem to help. Here are some symptoms: + # * Calling nvidia-smi timeouts after 60 second + # * Fail to run nvidia-smi with an unable to determine the device handle for GPU + # unknown error + # * Test fails with a missing CUDA GPU error when initializing CUDA in PyTorch + # * Run docker --gpus all fails with error response from daemon + # + # As both the root cause and recovery path are unclear, let's take the runner out of + # service so that it doesn't get any more jobs + - name: Check NVIDIA driver installation step + if: failure() && steps.install-nvidia-driver.outcome && steps.install-nvidia-driver.outcome != 'skipped' + shell: bash + env: + RUNNER_WORKSPACE: ${{ runner.workspace }} + run: | + set +e + set -x + + nvidia-smi + # NB: Surprisingly, nvidia-smi command returns successfully with return code 0 even in + # the case where the driver has already crashed as it still can get the driver version + # and some basic information like the bus ID. 
However, the rest of the information + # would be missing (ERR!), for example: + # + # +-----------------------------------------------------------------------------+ + # | NVIDIA-SMI 525.89.02 Driver Version: 525.89.02 CUDA Version: 12.0 | + # |-------------------------------+----------------------+----------------------+ + # | GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC | + # | Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. | + # | | | MIG M. | + # |===============================+======================+======================| + # | 0 ERR! Off | 00000000:00:1E.0 Off | ERR! | + # |ERR! ERR! ERR! ERR! / ERR! | 4184MiB / 23028MiB | ERR! Default | + # | | | ERR! | + # +-------------------------------+----------------------+----------------------+ + # + # +-----------------------------------------------------------------------------+ + # | Processes: | + # | GPU GI CI PID Type Process name GPU Memory | + # | ID ID Usage | + # |=============================================================================| + # +-----------------------------------------------------------------------------+ + # + # This should be reported as a failure instead as it will guarantee to fail when + # Docker tries to run with --gpus all + # + # So, the correct check here is to query one of the missing piece of info like + # GPU name, so that the command can fail accordingly + nvidia-smi --query-gpu=gpu_name --format=csv,noheader --id=0 + NVIDIA_SMI_STATUS=$? + + # These are acceptable return code from nvidia-smi as copied from setup-nvidia GitHub action + if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then + echo "NVIDIA driver installation has failed, shutting down the runner..." + .github/scripts/stop_runner_service.sh + fi + + # For runner with multiple GPUs, we also want to confirm that the number of GPUs are the + # power of 2, i.e. 1, 2, 4, or 8. This is to avoid flaky test issue when one GPU fails + # https://github.com/pytorch/test-infra/issues/4000 + GPU_COUNT=$(nvidia-smi --list-gpus | wc -l) + NVIDIA_SMI_STATUS=$? + + # These are acceptable return code from nvidia-smi as copied from setup-nvidia GitHub action + if [ "$NVIDIA_SMI_STATUS" -ne 0 ] && [ "$NVIDIA_SMI_STATUS" -ne 14 ]; then + echo "NVIDIA driver installation has failed, shutting down the runner..." + .github/scripts/stop_runner_service.sh + fi + + # Check the GPU count to be a power of 2 + if [ "$GPU_COUNT" -le 8 ] && [ "$GPU_COUNT" -ne 1 ] && [ "$GPU_COUNT" -ne 2 ] && [ "$GPU_COUNT" -ne 4 ] && [ "$GPU_COUNT" -ne 8 ]; then + echo "NVIDIA driver detects $GPU_COUNT GPUs. The runner has a broken GPU, shutting it down..." + .github/scripts/stop_runner_service.sh + fi diff --git a/.github/actions/pytest-cache-download/action.yml b/.github/actions/pytest-cache-download/action.yml index 1cf2f9f4ab57a..bbbeb9f43090d 100644 --- a/.github/actions/pytest-cache-download/action.yml +++ b/.github/actions/pytest-cache-download/action.yml @@ -9,6 +9,10 @@ inputs: job_identifier: description: Text that uniquely identifies a given job type within a workflow. All shards of a job should share the same job identifier. 
required: true + s3_bucket: + description: S3 bucket to download PyTest cache + required: false + default: "gha-artifacts" runs: using: composite @@ -30,6 +34,7 @@ runs: CACHE_DIR: ${{ inputs.cache_dir }} JOB_IDENTIFIER: ${{ inputs.job_identifier }} REPO: ${{ github.repository }} + BUCKET: ${{ inputs.s3_bucket }} run: | python3 .github/scripts/pytest_cache.py \ --download \ @@ -38,3 +43,4 @@ runs: --job_identifier $JOB_IDENTIFIER \ --temp_dir $RUNNER_TEMP \ --repo $REPO \ + --bucket $BUCKET \ diff --git a/.github/actions/setup-linux/action.yml b/.github/actions/setup-linux/action.yml index 98eab13c44077..193dc7d6fd93d 100644 --- a/.github/actions/setup-linux/action.yml +++ b/.github/actions/setup-linux/action.yml @@ -15,10 +15,12 @@ runs: category=$1 # If it is GCP runner (runner name contains gcp), do not run this runner_name_str=${{ runner.name }} - if [[ $runner_name_str != *"gcp"* ]]; then - curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" - else + if [[ -f /.inarc ]]; then + echo "ARC Runner, no info on ec2 metadata" + elif [[ $runner_name_str == *"gcp"* ]]; then echo "Runner is from Google Cloud Platform, No info on ec2 metadata" + else + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" fi } echo "ami-id: $(get_ec2_metadata ami-id)" @@ -26,8 +28,14 @@ runs: echo "instance-type: $(get_ec2_metadata instance-type)" echo "system info $(uname -a)" + - name: Check if in a ARC runner + shell: bash + id: check_arc_runner + run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> $GITHUB_OUTPUT + - name: Start docker if docker deamon is not running shell: bash + if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }} run: | if systemctl is-active --quiet docker; then echo "Docker daemon is running..."; @@ -58,6 +66,7 @@ runs: env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}" - name: Kill any existing containers, clean up images + if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }} shell: bash run: | # ignore expansion of "docker ps -q" since it could be empty @@ -96,3 +105,28 @@ runs: echo "${RESOLVED_IP} ${PT_DOMAIN}" | sudo tee -a /etc/hosts cat /etc/hosts + + - name: Check that the docker daemon is running + shell: bash + continue-on-error: true + if: ${{ steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'true' }} + run: | + set +x + + max_attempts=30 + delay=10 + attempt=1 + + for attempt in $(seq 1 $max_attempts); do + echo "Attempt $attempt of $max_attempts: Checking if Docker daemon is running..." + if docker info > /dev/null 2>&1; then + echo "Docker is running. Proceeding with the next steps" + exit 0 + else + echo "Docker is not running yet." + echo "Retrying in $delay seconds..." + sleep $delay + fi + done + echo "Reached maximum attempts to connect to Docker. Exiting." 
+ exit 1 diff --git a/.github/actions/setup-rocm/action.yml b/.github/actions/setup-rocm/action.yml index b9833480954b9..232a1e33a9c86 100644 --- a/.github/actions/setup-rocm/action.yml +++ b/.github/actions/setup-rocm/action.yml @@ -9,6 +9,16 @@ runs: shell: bash run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}" + - name: Remove leftover Docker config file + shell: bash + continue-on-error: true + run: | + set -ex + + cat ~/.docker/config.json || true + # https://stackoverflow.com/questions/64455468/error-when-logging-into-ecr-with-docker-login-error-saving-credentials-not + rm -f ~/.docker/config.json + - name: Stop all running docker containers if: always() shell: bash diff --git a/.github/actions/update-commit-hash/action.yml b/.github/actions/update-commit-hash/action.yml deleted file mode 100644 index 5a21d592d78f7..0000000000000 --- a/.github/actions/update-commit-hash/action.yml +++ /dev/null @@ -1,59 +0,0 @@ -name: Update commit hash - -inputs: - repo-owner: - required: false - type: string - description: Name of repository's owner. - default: pytorch - repo-name: - required: true - type: string - description: Name of the repository we're updating commit hash for. - branch: - required: true - type: string - description: Branch to fetch commit of - pin-folder: - type: string - description: Path to folder with commit pin - required: false - default: .github/ci_commit_pins - updatebot-token: - required: true - type: string - description: update bot token - pytorchbot-token: - required: true - type: string - description: update bot token - -description: update commit hash - -runs: - using: composite - steps: - - name: Checkout repo - uses: actions/checkout@v3 - with: - fetch-depth: 1 - submodules: false - token: ${{ inputs.updatebot-token }} - - name: Checkout - shell: bash - run: | - git clone https://github.com/${{ inputs.repo-owner }}/${{ inputs.repo-name }}.git --quiet - - name: Check if there already exists a PR - shell: bash - env: - REPO_NAME: ${{ inputs.repo-name }} - BRANCH: ${{ inputs.branch }} - PIN_FOLDER: ${{ inputs.pin-folder }} - UPDATEBOT_TOKEN: ${{ inputs.updatebot-token }} - PYTORCHBOT_TOKEN: ${{ inputs.pytorchbot-token }} - NEW_BRANCH_NAME: update-${{ inputs.repo-name }}-commit-hash/${{ github.run_id }}-${{ github.run_number }}-${{ github.run_attempt }} - run: | - # put this here instead of the script to prevent accidentally changing the config when running the script locally - git config --global user.name "PyTorch UpdateBot" - git config --global user.email "pytorchupdatebot@users.noreply.github.com" - python .github/scripts/update_commit_hashes.py --repo-name "${REPO_NAME}" --branch "${BRANCH}" --pin-folder "${PIN_FOLDER}" diff --git a/.github/actions/upload-test-artifacts/action.yml b/.github/actions/upload-test-artifacts/action.yml index 87fa30a675861..04cb43b20c389 100644 --- a/.github/actions/upload-test-artifacts/action.yml +++ b/.github/actions/upload-test-artifacts/action.yml @@ -11,6 +11,10 @@ inputs: Suffix to add to the filename of the artifacts. This should include the workflow job id, see [Job id in artifacts]. 
required: true + s3-bucket: + description: S3 bucket to download builds + required: false + default: "gha-artifacts" runs: using: composite @@ -42,7 +46,7 @@ runs: env: FILE_SUFFIX: ${{ inputs.file-suffix }} run: | - # Remove any previous test reports if they exist + # Remove any previous usage logs if they exist rm -f logs-*.zip # this workflow is also run in bazel build test, but we dont generate usage reports for it # so check to see if the file exists first @@ -53,6 +57,18 @@ runs: zip -r "logs-${FILE_SUFFIX}.zip" test -i '*.log' fi + - name: Zip debugging artifacts for upload + if: runner.os != 'Windows' && !inputs.use-gha + shell: bash + env: + FILE_SUFFIX: ${{ inputs.file-suffix }} + run: | + # Remove any previous debugging artifacts if they exist + rm -f debug-*.zip + if [ -d 'test/debug' ]; then + zip -r "debug-${FILE_SUFFIX}.zip" test/debug + fi + # Windows zip - name: Zip JSONs for upload if: runner.os == 'Windows' && !inputs.use-gha @@ -87,6 +103,7 @@ runs: uses: seemethere/upload-artifact-s3@v5 if: ${{ !inputs.use-gha }} with: + s3-bucket: ${{ inputs.s3-bucket }} s3-prefix: | ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact retention-days: 14 @@ -97,6 +114,7 @@ runs: uses: seemethere/upload-artifact-s3@v5 if: ${{ !inputs.use-gha }} with: + s3-bucket: ${{ inputs.s3-bucket }} s3-prefix: | ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact retention-days: 14 @@ -108,12 +126,25 @@ runs: if: ${{ !inputs.use-gha }} continue-on-error: true with: + s3-bucket: ${{ inputs.s3-bucket }} s3-prefix: | ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact retention-days: 14 if-no-files-found: ignore path: logs-*.zip + - name: Store Debug Artifacts on S3 + uses: seemethere/upload-artifact-s3@v5 + if: ${{ !inputs.use-gha }} + continue-on-error: true + with: + s3-bucket: ${{ inputs.s3-bucket }} + s3-prefix: | + ${{ github.repository }}/${{ github.run_id }}/${{ github.run_attempt }}/artifact + retention-days: 14 + if-no-files-found: ignore + path: debug-*.zip + # GHA upload - name: Store Test Downloaded JSONs on Github uses: actions/upload-artifact@v3 diff --git a/.github/auto_request_review.yml b/.github/auto_request_review.yml index 05162f87ea5d7..3ec436d107622 100644 --- a/.github/auto_request_review.yml +++ b/.github/auto_request_review.yml @@ -6,7 +6,6 @@ reviewers: - albanD - miladm - bdhirsh - - voznesenskym per_author: symbolic-shapes: diff --git a/.github/ci_commit_pins/audio.txt b/.github/ci_commit_pins/audio.txt index 4145279c4cf16..9830a3ce9650e 100644 --- a/.github/ci_commit_pins/audio.txt +++ b/.github/ci_commit_pins/audio.txt @@ -1 +1 @@ -e3efbc2d9094685dd2d4ae143853941f82f167af +ea437b31ce316ea3d66fe73768c0dcb94edb79ad diff --git a/.github/ci_commit_pins/torchbench.txt b/.github/ci_commit_pins/torchbench.txt index f2d5071375cf6..3df9dd6cf8038 100644 --- a/.github/ci_commit_pins/torchbench.txt +++ b/.github/ci_commit_pins/torchbench.txt @@ -1 +1 @@ -99944a2fb8624947f9c0e2edc898ff42a16124da +d6015d42d9a1834bc7595c4bd6852562fb80b30b diff --git a/.github/ci_commit_pins/vision.txt b/.github/ci_commit_pins/vision.txt index afca955c6c27d..c642e5d08c80d 100644 --- a/.github/ci_commit_pins/vision.txt +++ b/.github/ci_commit_pins/vision.txt @@ -1 +1 @@ -d23430765b5df76cd1267f438f129f51b7d6e3e1 +d23a6e1664d20707c11781299611436e1f0c104f diff --git a/.github/ci_commit_pins/xla.txt b/.github/ci_commit_pins/xla.txt index 807efc6f54204..5258ab80f18a8 100644 --- a/.github/ci_commit_pins/xla.txt +++ 
b/.github/ci_commit_pins/xla.txt @@ -1 +1 @@ -e1c94dfa5a74331a376537c23bf74a2c367f24bd +e3fc03314dab5f44e3ed9ccbba6c15fbca3285cd diff --git a/.github/label_to_label.yml b/.github/label_to_label.yml new file mode 100644 index 0000000000000..e6c66a5e56cf6 --- /dev/null +++ b/.github/label_to_label.yml @@ -0,0 +1,13 @@ +# Use this to auto apply labels based on other labels. Applies to both PRs and +# issues. Currently only supports any and all +- any: + - "module: custom operators" + - "module: aotdispatch" + then: + - "module: pt2-dispatcher" +- any: + - "module: dynamo" + - "module: pt2-dispatcher" + - "module: inductor" + then: + - "oncall: pt2" diff --git a/.github/labeler.yml b/.github/labeler.yml index 4b7e5488e7881..f436ec684ffb9 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -26,10 +26,18 @@ - .github/ci_commit_pins/** - c10/core/Sym* - torch/fx/experimental/symbolic_shapes.py +- torch/fx/experimental/recording.py +- torch/fx/experimental/sym_node.py +- torch/fx/experimental/validator.py +- torch/fx/experimental/_sym_dispatch_mode.py +- torch/fx/experimental/proxy_tensor.py - test/distributed/_tensor/test_dtensor_compile.py - test/distributed/tensor/parallel/test_fsdp_2d_parallel.py - torch/distributed/_tensor/** - torch/distributed/fsdp/** +- torch/csrc/inductor/** +- test/cpp/aoti_abi_check/** +- test/cpp/aoti_inference/** "module: cpu": - aten/src/ATen/cpu/** @@ -39,6 +47,7 @@ - aten/src/ATen/native/mkldnn/** - torch/cpu/** - torch/utils/mkldnn.py +- torch/utils/_sympy/** - test/test_mkldnn.py "module: mkldnn": @@ -49,6 +58,17 @@ - third_party/mkl-dnn.BUILD - torch/csrc/jit/codegen/onednn/** - test/test_jit_llga_fuser.py +- test/test_mkldnn.py + +"ciflow/linux-aarch64": +- third_party/ideep +- caffe2/ideep/** +- caffe2/python/ideep/** +- cmake/Modules/FindMKLDNN.cmake +- third_party/mkl-dnn.BUILD +- torch/csrc/jit/codegen/onednn/** +- test/test_jit_llga_fuser.py +- test/test_mkldnn.py "module: amp (automated mixed precision)": - torch/amp/** diff --git a/.github/merge_rules.yaml b/.github/merge_rules.yaml index fec99bd9e1ffe..db0ec3c51aa79 100644 --- a/.github/merge_rules.yaml +++ b/.github/merge_rules.yaml @@ -28,12 +28,13 @@ - caffe2/python/onnx/** approved_by: - BowenBao - - abock - justinchuby + - liqunfu - shubhambhokare1 - thiagocrepaldi - titaiwangms - wschin + - xadupre mandatory_checks_name: - EasyCLA - Lint @@ -236,6 +237,23 @@ - Lint - pull +- name: XPU ATen + patterns: + - aten/src/ATen/xpu/** + - c10/xpu/** + - torch/csrc/xpu/** + - torch/xpu/** + - test/xpu/** + - third_party/xpu.txt + approved_by: + - EikanWang + - jgong5 + - gujinghui + mandatory_checks_name: + - EasyCLA + - Lint + - pull + - name: Distributions patterns: - torch/distributions/** @@ -275,17 +293,20 @@ - wanchaol - fduwjj - H-Huang - - aazzolini - kwen2501 - XilunWu - wz337 - awgu - fegin - - kumpera - - yhcharles + - kurman + - LucasLLC + - sanketpurandare + - shuqiangzhang + - tianyu-l - kiukchung - d4l3k - shuqiangzhang + - weifengpy mandatory_checks_name: - EasyCLA - Lint @@ -354,12 +375,14 @@ - name: CPU inductor patterns: + - torch/_inductor/mkldnn_lowerings.py - torch/_inductor/fx_passes/mkldnn_fusion.py - torch/_inductor/fx_passes/quantization.py - torch/_inductor/codegen/cpp.py - test/inductor/test_mkldnn_pattern_matcher.py - test/inductor/test_cpu_repo.py - test/inductor/test_cpu_cpp_wrapper.py + - aten/src/ATen/cpu/** - aten/src/ATen/native/quantized/cpu/** - test/quantization/core/test_quantized_op.py - torch/ao/quantization/quantizer/x86_inductor_quantizer.py diff --git 
a/.github/pytorch-probot.yml b/.github/pytorch-probot.yml index be63e07105349..4bf7526e79141 100644 --- a/.github/pytorch-probot.yml +++ b/.github/pytorch-probot.yml @@ -1,5 +1,6 @@ tracking_issue: 24422 ciflow_tracking_issue: 64124 +TD_rollout_issue: 123120 ciflow_push_tags: - ciflow/binaries - ciflow/binaries_conda @@ -7,6 +8,8 @@ ciflow_push_tags: - ciflow/binaries_wheel - ciflow/inductor - ciflow/inductor-perf-compare +- ciflow/inductor-micro-benchmark +- ciflow/linux-aarch64 - ciflow/mps - ciflow/nightly - ciflow/periodic @@ -15,9 +18,12 @@ ciflow_push_tags: - ciflow/trunk - ciflow/unstable - ciflow/xpu +- ciflow/torchbench retryable_workflows: - lint - pull - trunk - linux-binary - windows-binary +labeler_config: labeler.yml +label_to_label_config: label_to_label.yml diff --git a/.github/requirements-gha-cache.txt b/.github/requirements-gha-cache.txt index 80f04544225df..1064212c5b4ed 100644 --- a/.github/requirements-gha-cache.txt +++ b/.github/requirements-gha-cache.txt @@ -1,12 +1,11 @@ # This file is to cache other dependencies not specified elsewhere in: # requirement.txt -# requirements-flake8.txt # docs/requirements.txt # docs/cpp/requirements.txt # functorch/docs/requirements.txt # .ci/docker/requirements-ci.txt boto3==1.19.12 -jinja2==3.0.1 +jinja2==3.1.4 lintrunner==0.10.7 ninja==1.10.0.post1 nvidia-ml-py==11.525.84 diff --git a/.github/requirements/conda-env-Linux-X64.txt b/.github/requirements/conda-env-Linux-X64.txt index 43afafcd26011..16bbc57dd3be2 100644 --- a/.github/requirements/conda-env-Linux-X64.txt +++ b/.github/requirements/conda-env-Linux-X64.txt @@ -4,6 +4,6 @@ mkl-include=2022.1.0 ninja=1.10.2 numpy=1.23.3 pyyaml=6.0 -requests=2.28.1 -setuptools=65.5.0 +requests=2.31.0 +setuptools=68.2.2 typing-extensions=4.3.0 diff --git a/.github/requirements/conda-env-iOS.txt b/.github/requirements/conda-env-iOS.txt index cd94a40a21ab8..205c07925a016 100644 --- a/.github/requirements/conda-env-iOS.txt +++ b/.github/requirements/conda-env-iOS.txt @@ -3,6 +3,6 @@ cmake=3.22.1 ninja=1.10.2 numpy=1.23.3 pyyaml=6.0 -requests=2.28.1 -setuptools=63.4.1 +requests=2.31.0 +setuptools=68.2.2 typing-extensions=4.3.0 diff --git a/.github/requirements/pip-requirements-iOS.txt b/.github/requirements/pip-requirements-iOS.txt index 30e67abc5c863..01290e4c7102d 100644 --- a/.github/requirements/pip-requirements-iOS.txt +++ b/.github/requirements/pip-requirements-iOS.txt @@ -1,4 +1,4 @@ # iOS simulator requirements coremltools==5.0b5 protobuf==3.20.2 -optree==0.9.1 +optree==0.11.0 diff --git a/.github/requirements/pip-requirements-macOS.txt b/.github/requirements/pip-requirements-macOS.txt index 9b6986287391c..f0e4890328b35 100644 --- a/.github/requirements/pip-requirements-macOS.txt +++ b/.github/requirements/pip-requirements-macOS.txt @@ -16,7 +16,6 @@ pytest==7.3.2 pytest-xdist==3.3.1 pytest-rerunfailures==10.3 pytest-flakefinder==1.1.0 -pytest-shard==0.1.2 scipy==1.10.1 sympy==1.11.1 unittest-xml-reporting<=3.2.0,>=2.0.0 @@ -27,4 +26,7 @@ pytest-cpp==2.3.0 rockset==1.0.3 z3-solver==4.12.2.0 tensorboard==2.13.0 -optree==0.9.1 +optree==0.11.0 +# NB: test_hparams_* from test_tensorboard is failing with protobuf 5.26.0 in +# which the stringify metadata is wrong when escaping double quote +protobuf==3.20.2 diff --git a/.github/scripts/amd/package_triton_wheel.sh b/.github/scripts/amd/package_triton_wheel.sh new file mode 100755 index 0000000000000..4295a97a340e4 --- /dev/null +++ b/.github/scripts/amd/package_triton_wheel.sh @@ -0,0 +1,99 @@ +set -ex + +# Set ROCM_HOME isn't available, use 
ROCM_PATH if set or /opt/rocm +ROCM_HOME="${ROCM_HOME:-${ROCM_PATH:-/opt/rocm}}" + +# Find rocm_version.h header file for ROCm version extract +rocm_version_h="${ROCM_HOME}/include/rocm-core/rocm_version.h" +if [ ! -f "$rocm_version_h" ]; then + rocm_version_h="${ROCM_HOME}/include/rocm_version.h" +fi + +# Error out if rocm_version.h not found +if [ ! -f "$rocm_version_h" ]; then + echo "Error: rocm_version.h not found in expected locations." >&2 + exit 1 +fi + +# Extract major, minor and patch ROCm version numbers +MAJOR_VERSION=$(grep 'ROCM_VERSION_MAJOR' "$rocm_version_h" | awk '{print $3}') +MINOR_VERSION=$(grep 'ROCM_VERSION_MINOR' "$rocm_version_h" | awk '{print $3}') +PATCH_VERSION=$(grep 'ROCM_VERSION_PATCH' "$rocm_version_h" | awk '{print $3}') +ROCM_INT=$(($MAJOR_VERSION * 10000 + $MINOR_VERSION * 100 + $PATCH_VERSION)) +echo "ROCm version: $ROCM_INT" + +# Check TRITON_ROCM_DIR is set +if [[ -z "${TRITON_ROCM_DIR}" ]]; then + export TRITON_ROCM_DIR=third_party/amd/backend +fi + +# Remove packaged libs and headers +rm -rf $TRITON_ROCM_DIR/include/* + +LIBTINFO_PATH="/usr/lib64/libtinfo.so.5" +LIBNUMA_PATH="/usr/lib64/libnuma.so.1" +LIBELF_PATH="/usr/lib64/libelf.so.1" + +OS_SO_PATHS=( + $LIBELF_PATH + $LIBNUMA_PATH + $LIBTINFO_PATH +) + +for lib in "${OS_SO_PATHS[@]}" +do + cp $lib $TRITON_ROCM_DIR/lib/ +done + +# Required ROCm libraries +if [[ "${MAJOR_VERSION}" == "6" ]]; then + libamdhip="libamdhip64.so.6" +else + libamdhip="libamdhip64.so.5" +fi + +# Required ROCm libraries - ROCm 6.0 +ROCM_SO=( + "${libamdhip}" + "libhsa-runtime64.so.1" + "libamd_comgr.so.2" + "libdrm.so.2" + "libdrm_amdgpu.so.1" +) + +if [[ $ROCM_INT -ge 60100 ]]; then + ROCM_SO+=("librocprofiler-register.so.0") +fi + +for lib in "${ROCM_SO[@]}" +do + file_path=($(find $ROCM_HOME/lib/ -name "$lib")) # First search in lib + if [[ -z $file_path ]]; then + if [ -d "$ROCM_HOME/lib64/" ]; then + file_path=($(find $ROCM_HOME/lib64/ -name "$lib")) # Then search in lib64 + fi + fi + if [[ -z $file_path ]]; then + file_path=($(find $ROCM_HOME/ -name "$lib")) # Then search in ROCM_HOME + fi + if [[ -z $file_path ]]; then + file_path=($(find /opt/ -name "$lib")) # Then search in /opt + fi + if [[ -z $file_path ]]; then + echo "Error: Library file $lib is not found." 
>&2 + exit 1 + fi + + cp $file_path $TRITON_ROCM_DIR/lib + # When running locally, and not building a wheel, we need to satisfy shared objects requests that don't look for versions + LINKNAME=$(echo $lib | sed -e 's/\.so.*/.so/g') + ln -sf $lib $TRITON_ROCM_DIR/lib/$LINKNAME + +done + +# Copy Include Files +cp -r $ROCM_HOME/include/hip $TRITON_ROCM_DIR/include + +# Copy linker +mkdir -p $TRITON_ROCM_DIR/llvm/bin +cp $ROCM_HOME/llvm/bin/ld.lld $TRITON_ROCM_DIR/llvm/bin/ diff --git a/.github/scripts/amd/patch_triton_wheel.sh b/.github/scripts/amd/patch_triton_wheel.sh new file mode 100755 index 0000000000000..667fcb645587c --- /dev/null +++ b/.github/scripts/amd/patch_triton_wheel.sh @@ -0,0 +1,103 @@ +#!/bin/bash +set -x + +if [ -z "$1" ]; then + echo "Need wheel location argument" && exit 1 +fi + +WHEELHOUSE_DIR=$1 +PATCHELF_BIN=patchelf +ROCM_LIB=backends/amd/lib +ROCM_LD=backends/amd/llvm/bin +PREFIX=triton +fname_without_so_number() { + LINKNAME=$(echo $1 | sed -e 's/\.so.*/.so/g') + echo "$LINKNAME" +} + +replace_needed_sofiles() { + find $1 -name '*.so*' -o -name 'ld.lld' | while read sofile; do + origname=$2 + patchedname=$3 + if [[ "$origname" != "$patchedname" ]]; then + set +e + origname=$($PATCHELF_BIN --print-needed $sofile | grep "$origname.*") + ERRCODE=$? + set -e + if [ "$ERRCODE" -eq "0" ]; then + echo "patching $sofile entry $origname to $patchedname" + $PATCHELF_BIN --replace-needed $origname $patchedname $sofile + fi + fi + done +} + +mkdir -p "/tmp_dir" +pushd /tmp_dir +for pkg in /$WHEELHOUSE_DIR/*triton*.whl; do + echo "Modifying $pkg" + rm -rf tmp + mkdir -p tmp + cd tmp + cp $pkg . + unzip -q $(basename $pkg) + rm -f $(basename $pkg) + $PATCHELF_BIN --set-rpath ${LD_SO_RPATH:-'$ORIGIN:$ORIGIN/../../lib'} $PREFIX/$ROCM_LD/ld.lld + $PATCHELF_BIN --print-rpath $PREFIX/$ROCM_LD/ld.lld + # Modify libtriton.so as it sits in _C directory apart from its dependencies + find $PREFIX/_C -type f -name "*.so*" | while read sofile; do + echo "Setting rpath of $sofile" + $PATCHELF_BIN --set-rpath ${C_SO_RPATH:-'$ORIGIN:$ORIGIN/'../$ROCM_LIB} ${FORCE_RPATH:-} $sofile + $PATCHELF_BIN --print-rpath $sofile + done + + # All included dependencies are included in a single lib directory + deps=() + deps_soname=() + while read sofile; do + echo "Setting rpath of $sofile to ${LIB_SO_RPATH:-'$ORIGIN'}" + $PATCHELF_BIN --set-rpath ${LIB_SO_RPATH:-'$ORIGIN'} ${FORCE_RPATH:-} $sofile + $PATCHELF_BIN --print-rpath $sofile + deps+=("$sofile") + deps_soname+=("$(basename $sofile)") + done < <(find $PREFIX/$ROCM_LIB -type f -name "*.so*") + + patched=() + for filepath in "${deps[@]}"; do + filename=$(basename $filepath) + destpath=$PREFIX/$ROCM_LIB/$filename + if [[ "$filepath" != "$destpath" ]]; then + cp $filepath $destpath + fi + patchedpath=$(fname_without_so_number $destpath) + patchedname=$(basename $patchedpath) + if [[ "$destpath" != "$patchedpath" ]]; then + mv $destpath $patchedpath + fi + patched+=("$patchedname") + echo "Copied $filepath to $patchedpath" + done + + # Go through all required shared objects and see if any of our other objects are dependants. 
If so, replace so.ver wth so + for ((i=0;i<${#deps[@]};++i)); do + echo "replacing "${deps_soname[i]} ${patched[i]} + replace_needed_sofiles $PREFIX/$ROCM_LIB ${deps_soname[i]} ${patched[i]} + replace_needed_sofiles $PREFIX/_C ${deps_soname[i]} ${patched[i]} + replace_needed_sofiles $PREFIX/$ROCM_LD ${deps_soname[i]} ${patched[i]} + done + + # Re-bundle whl with so adjustments + zip -rqy $(basename $pkg) * + + if [[ -z "${MANYLINUX_VERSION}" ]]; then + newpkg=$pkg + else + newpkg=$(echo $pkg | sed -e "s/\linux_x86_64/${MANYLINUX_VERSION}/g") + fi + + # Remove original whl + rm -f $pkg + + # Move rebuilt whl to original location with new name. + mv $(basename $pkg) $newpkg +done diff --git a/.github/scripts/build_triton_wheel.py b/.github/scripts/build_triton_wheel.py index 693d6892ff592..f422f6766cc40 100644 --- a/.github/scripts/build_triton_wheel.py +++ b/.github/scripts/build_triton_wheel.py @@ -10,9 +10,6 @@ SCRIPT_DIR = Path(__file__).parent REPO_DIR = SCRIPT_DIR.parent.parent -# TODO: Remove me once Triton version is again in sync for vanilla and ROCm -ROCM_TRITION_VERSION = "2.1.0" - def read_triton_pin(rocm_hash: bool = False) -> str: triton_file = "triton.txt" if not rocm_hash else "triton-rocm.txt" @@ -32,27 +29,6 @@ def check_and_replace(inp: str, src: str, dst: str) -> str: return inp.replace(src, dst) -def patch_setup_py( - path: Path, - *, - version: str, - name: str = "triton", - expected_version: Optional[str] = None, -) -> None: - with open(path) as f: - orig = f.read() - # Replace name - orig = check_and_replace(orig, 'name="triton",', f'name="{name}",') - # Replace version - if not expected_version: - expected_version = read_triton_version() - orig = check_and_replace( - orig, f'version="{expected_version}",', f'version="{version}",' - ) - with open(path, "w") as f: - f.write(orig) - - def patch_init_py( path: Path, *, version: str, expected_version: Optional[str] = None ) -> None: @@ -92,14 +68,20 @@ def build_triton( with TemporaryDirectory() as tmpdir: triton_basedir = Path(tmpdir) / "triton" triton_pythondir = triton_basedir / "python" + triton_repo = "https://github.com/openai/triton" if build_rocm: - triton_repo = "https://github.com/ROCmSoftwarePlatform/triton" triton_pkg_name = "pytorch-triton-rocm" else: - triton_repo = "https://github.com/openai/triton" triton_pkg_name = "pytorch-triton" check_call(["git", "clone", triton_repo], cwd=tmpdir) - check_call(["git", "checkout", commit_hash], cwd=triton_basedir) + if release: + ver, rev, patch = version.split(".") + check_call( + ["git", "checkout", f"release/{ver}.{rev}.x"], cwd=triton_basedir + ) + else: + check_call(["git", "checkout", commit_hash], cwd=triton_basedir) + if build_conda: with open(triton_basedir / "meta.yaml", "w") as meta: print( @@ -109,7 +91,7 @@ def build_triton( print("source:\n path: .\n", file=meta) print( "build:\n string: py{{py}}\n number: 1\n script: cd python; " - "python setup.py install --single-version-externally-managed --record=record.txt\n", + "python setup.py install --record=record.txt\n", " script_env:\n - MAX_JOBS\n", file=meta, ) @@ -155,18 +137,15 @@ def build_triton( patch_init_py( triton_pythondir / "triton" / "__init__.py", version=f"{version}", - expected_version=ROCM_TRITION_VERSION if build_rocm else None, + expected_version=None, ) if build_rocm: - # TODO: Remove me when ROCM triton is updated - patch_setup_py( - triton_pythondir / "setup.py", - name=triton_pkg_name, - version=f"{version}", - expected_version=ROCM_TRITION_VERSION, + check_call( + 
[f"{SCRIPT_DIR}/amd/package_triton_wheel.sh"], + cwd=triton_basedir, + shell=True, ) - check_call("scripts/amd/setup_rocm_libs.sh", cwd=triton_basedir, shell=True) print("ROCm libraries setup for triton installation...") check_call( @@ -177,7 +156,10 @@ def build_triton( shutil.copy(whl_path, Path.cwd()) if build_rocm: - check_call("scripts/amd/fix_so.sh", cwd=triton_basedir, shell=True) + check_call( + [f"{SCRIPT_DIR}/amd/patch_triton_wheel.sh", Path.cwd()], + cwd=triton_basedir, + ) return Path.cwd() / whl_path.name diff --git a/.github/scripts/cherry_pick.py b/.github/scripts/cherry_pick.py new file mode 100755 index 0000000000000..4c892de21da8a --- /dev/null +++ b/.github/scripts/cherry_pick.py @@ -0,0 +1,223 @@ +#!/usr/bin/env python3 + +import json +import os +import re +from typing import Any, Optional + +from urllib.error import HTTPError + +from github_utils import gh_fetch_url, gh_post_pr_comment + +from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo +from trymerge import get_pr_commit_sha, GitHubPR + + +# This is only a suggestion for now, not a strict requirement +REQUIRES_ISSUE = { + "regression", + "critical", + "fixnewfeature", +} + + +def parse_args() -> Any: + from argparse import ArgumentParser + + parser = ArgumentParser("cherry pick a landed PR onto a release branch") + parser.add_argument( + "--onto-branch", type=str, required=True, help="the target release branch" + ) + parser.add_argument( + "--github-actor", type=str, required=True, help="all the world's a stage" + ) + parser.add_argument( + "--classification", + choices=["regression", "critical", "fixnewfeature", "docs", "release"], + required=True, + help="the cherry pick category", + ) + parser.add_argument("pr_num", type=int) + parser.add_argument( + "--fixes", + type=str, + default="", + help="the GitHub issue that the cherry pick fixes", + ) + parser.add_argument("--dry-run", action="store_true") + + return parser.parse_args() + + +def get_merge_commit_sha(repo: GitRepo, pr: GitHubPR) -> Optional[str]: + """ + Return the merge commit SHA iff the PR has been merged. For simplicity, we + will only cherry pick PRs that have been merged into main + """ + commit_sha = get_pr_commit_sha(repo, pr) + return commit_sha if pr.is_closed() else None + + +def cherry_pick( + github_actor: str, + repo: GitRepo, + pr: GitHubPR, + commit_sha: str, + onto_branch: str, + classification: str, + fixes: str, + dry_run: bool = False, +) -> None: + """ + Create a local branch to cherry pick the commit and submit it as a pull request + """ + current_branch = repo.current_branch() + cherry_pick_branch = create_cherry_pick_branch( + github_actor, repo, pr, commit_sha, onto_branch + ) + + try: + if not dry_run: + org, project = repo.gh_owner_and_name() + cherry_pick_pr = submit_pr(repo, pr, cherry_pick_branch, onto_branch) + + msg = f"The cherry pick PR is at {cherry_pick_pr}" + if fixes: + msg += f" and it is linked with issue {fixes}" + elif classification in REQUIRES_ISSUE: + msg += f" and it is recommended to link a {classification} cherry pick PR with an issue" + + post_comment(org, project, pr.pr_num, msg) + + finally: + if current_branch: + repo.checkout(branch=current_branch) + + +def create_cherry_pick_branch( + github_actor: str, repo: GitRepo, pr: GitHubPR, commit_sha: str, onto_branch: str +) -> str: + """ + Create a local branch and cherry pick the commit. Return the name of the local + cherry picking branch. 
+ """ + repo.checkout(branch=onto_branch) + repo._run_git("submodule", "update", "--init", "--recursive") + + # Remove all special characters if we want to include the actor in the branch name + github_actor = re.sub("[^0-9a-zA-Z]+", "_", github_actor) + + cherry_pick_branch = f"cherry-pick-{pr.pr_num}-by-{github_actor}" + repo.create_branch_and_checkout(branch=cherry_pick_branch) + + # We might want to support ghstack later + repo._run_git("cherry-pick", "-x", "-X", "theirs", commit_sha) + repo.push(branch=cherry_pick_branch, dry_run=False) + + return cherry_pick_branch + + +def submit_pr( + repo: GitRepo, + pr: GitHubPR, + cherry_pick_branch: str, + onto_branch: str, +) -> str: + """ + Submit the cherry pick PR and return the link to the PR + """ + org, project = repo.gh_owner_and_name() + + default_msg = f"Cherry pick #{pr.pr_num} onto {onto_branch} branch" + title = pr.info.get("title", default_msg) + body = pr.info.get("body", default_msg) + + try: + response = gh_fetch_url( + f"https://api.github.com/repos/{org}/{project}/pulls", + method="POST", + data={ + "title": title, + "body": body, + "head": cherry_pick_branch, + "base": onto_branch, + }, + headers={"Accept": "application/vnd.github.v3+json"}, + reader=json.load, + ) + + cherry_pick_pr = response.get("html_url", "") + if not cherry_pick_pr: + raise RuntimeError( + f"Fail to find the cherry pick PR: {json.dumps(response)}" + ) + + return str(cherry_pick_pr) + + except HTTPError as error: + msg = f"Fail to submit the cherry pick PR: {error}" + raise RuntimeError(msg) from error + + +def post_comment(org: str, project: str, pr_num: int, msg: str) -> None: + """ + Post a comment on the PR itself to point to the cherry picking PR when success + or print the error when failure + """ + internal_debugging = "" + + run_url = os.getenv("GH_RUN_URL") + # Post a comment to tell folks that the PR is being cherry picked + if run_url is not None: + internal_debugging = "\n".join( + line + for line in ( + "
<details><summary>Details for Dev Infra team</summary>", + f'Raised by <a href="{run_url}">workflow job</a>\n', + "</details>
", + ) + if line + ) + + comment = "\n".join( + (f"### Cherry picking #{pr_num}", f"{msg}", "", f"{internal_debugging}") + ) + gh_post_pr_comment(org, project, pr_num, comment) + + +def main() -> None: + args = parse_args() + pr_num = args.pr_num + + repo = GitRepo(get_git_repo_dir(), get_git_remote_name()) + org, project = repo.gh_owner_and_name() + + pr = GitHubPR(org, project, pr_num) + + try: + commit_sha = get_merge_commit_sha(repo, pr) + if not commit_sha: + raise RuntimeError( + f"Refuse to cherry pick #{pr_num} because it hasn't been merged yet" + ) + + cherry_pick( + args.github_actor, + repo, + pr, + commit_sha, + args.onto_branch, + args.classification, + args.fixes, + args.dry_run, + ) + + except RuntimeError as error: + if not args.dry_run: + post_comment(org, project, pr_num, str(error)) + else: + raise error + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/comment_on_pr.py b/.github/scripts/comment_on_pr.py index 88edcce27cd5d..57fce4bf97399 100644 --- a/.github/scripts/comment_on_pr.py +++ b/.github/scripts/comment_on_pr.py @@ -23,8 +23,10 @@ def main() -> None: job_link = f"[job]({run_url})" if run_url is not None else "job" msg = ( - f"The {args.action} {job_link} was canceled. If you believe this is a mistake," - + f" then you can re trigger it through [pytorch-bot]({BOT_COMMANDS_WIKI})." + f"The {args.action} {job_link} was canceled or timed out. This most often happen if two merge requests were issued" + + " for the same PR, or if merge job was waiting for more than 6 hours for tests to finish." + + " In later case, please do not hesitate to reissue the merge command\n" + + f" For more information see [pytorch-bot wiki]({BOT_COMMANDS_WIKI})." ) gh_post_pr_comment(org, project, args.pr_num, msg) diff --git a/.github/scripts/delete_old_branches.py b/.github/scripts/delete_old_branches.py new file mode 100644 index 0000000000000..21b86fefa1a89 --- /dev/null +++ b/.github/scripts/delete_old_branches.py @@ -0,0 +1,274 @@ +# Delete old branches +import os +import re +from datetime import datetime +from pathlib import Path +from typing import Any, Callable, Dict, List, Set + +from github_utils import gh_fetch_json_dict, gh_graphql +from gitutils import GitRepo + +SEC_IN_DAY = 24 * 60 * 60 +CLOSED_PR_RETENTION = 30 * SEC_IN_DAY +NO_PR_RETENTION = 1.5 * 365 * SEC_IN_DAY +PR_WINDOW = 90 * SEC_IN_DAY # Set to None to look at all PRs (may take a lot of tokens) +REPO_OWNER = "pytorch" +REPO_NAME = "pytorch" +ESTIMATED_TOKENS = [0] + +TOKEN = os.environ["GITHUB_TOKEN"] +if not TOKEN: + raise Exception("GITHUB_TOKEN is not set") # noqa: TRY002 + +REPO_ROOT = Path(__file__).parent.parent.parent + +# Query for all PRs instead of just closed/merged because it's faster +GRAPHQL_ALL_PRS_BY_UPDATED_AT = """ +query ($owner: String!, $repo: String!, $cursor: String) { + repository(owner: $owner, name: $repo) { + pullRequests( + first: 100 + after: $cursor + orderBy: {field: UPDATED_AT, direction: DESC} + ) { + totalCount + pageInfo { + hasNextPage + endCursor + } + nodes { + headRefName + number + updatedAt + state + } + } + } +} +""" + +GRAPHQL_OPEN_PRS = """ +query ($owner: String!, $repo: String!, $cursor: String) { + repository(owner: $owner, name: $repo) { + pullRequests( + first: 100 + after: $cursor + states: [OPEN] + ) { + totalCount + pageInfo { + hasNextPage + endCursor + } + nodes { + headRefName + number + updatedAt + state + } + } + } +} +""" + +GRAPHQL_NO_DELETE_BRANCH_LABEL = """ +query ($owner: String!, $repo: String!, $cursor: String) { + 
repository(owner: $owner, name: $repo) { + label(name: "no-delete-branch") { + pullRequests(first: 100, after: $cursor) { + totalCount + pageInfo { + hasNextPage + endCursor + } + nodes { + headRefName + number + updatedAt + state + } + } + } + } +} +""" + + +def is_protected(branch: str) -> bool: + try: + ESTIMATED_TOKENS[0] += 1 + res = gh_fetch_json_dict( + f"https://api.github.com/repos/{REPO_OWNER}/{REPO_NAME}/branches/{branch}" + ) + return bool(res["protected"]) + except Exception as e: + print(f"[{branch}] Failed to fetch branch protections: {e}") + return True + + +def convert_gh_timestamp(date: str) -> float: + return datetime.strptime(date, "%Y-%m-%dT%H:%M:%SZ").timestamp() + + +def get_branches(repo: GitRepo) -> Dict[str, Any]: + # Query locally for branches, group by branch base name (e.g. gh/blah/base -> gh/blah), and get the most recent branch + git_response = repo._run_git( + "for-each-ref", + "--sort=creatordate", + "--format=%(refname) %(committerdate:iso-strict)", + "refs/remotes/origin", + ) + branches_by_base_name: Dict[str, Any] = {} + for line in git_response.splitlines(): + branch, date = line.split(" ") + re_branch = re.match(r"refs/remotes/origin/(.*)", branch) + assert re_branch + branch = branch_base_name = re_branch.group(1) + if x := re.match(r"(gh\/.+)\/(head|base|orig)", branch): + branch_base_name = x.group(1) + date = datetime.fromisoformat(date).timestamp() + if branch_base_name not in branches_by_base_name: + branches_by_base_name[branch_base_name] = [date, [branch]] + else: + branches_by_base_name[branch_base_name][1].append(branch) + if date > branches_by_base_name[branch_base_name][0]: + branches_by_base_name[branch_base_name][0] = date + return branches_by_base_name + + +def paginate_graphql( + query: str, + kwargs: Dict[str, Any], + termination_func: Callable[[List[Dict[str, Any]]], bool], + get_data: Callable[[Dict[str, Any]], List[Dict[str, Any]]], + get_page_info: Callable[[Dict[str, Any]], Dict[str, Any]], +) -> List[Any]: + hasNextPage = True + endCursor = None + data: List[Dict[str, Any]] = [] + while hasNextPage: + ESTIMATED_TOKENS[0] += 1 + res = gh_graphql(query, cursor=endCursor, **kwargs) + data.extend(get_data(res)) + hasNextPage = get_page_info(res)["hasNextPage"] + endCursor = get_page_info(res)["endCursor"] + if termination_func(data): + break + return data + + +def get_recent_prs() -> Dict[str, Any]: + now = datetime.now().timestamp() + + # Grab all PRs updated in last CLOSED_PR_RETENTION days + pr_infos: List[Dict[str, Any]] = paginate_graphql( + GRAPHQL_ALL_PRS_BY_UPDATED_AT, + {"owner": "pytorch", "repo": "pytorch"}, + lambda data: ( + PR_WINDOW is not None + and (now - convert_gh_timestamp(data[-1]["updatedAt"]) > PR_WINDOW) + ), + lambda res: res["data"]["repository"]["pullRequests"]["nodes"], + lambda res: res["data"]["repository"]["pullRequests"]["pageInfo"], + ) + + # Get the most recent PR for each branch base (group gh together) + prs_by_branch_base = {} + for pr in pr_infos: + pr["updatedAt"] = convert_gh_timestamp(pr["updatedAt"]) + branch_base_name = pr["headRefName"] + if x := re.match(r"(gh\/.+)\/(head|base|orig)", branch_base_name): + branch_base_name = x.group(1) + if branch_base_name not in prs_by_branch_base: + prs_by_branch_base[branch_base_name] = pr + else: + if pr["updatedAt"] > prs_by_branch_base[branch_base_name]["updatedAt"]: + prs_by_branch_base[branch_base_name] = pr + return prs_by_branch_base + + +def get_branches_with_magic_label_or_open_pr() -> Set[str]: + pr_infos: List[Dict[str, Any]] = 
paginate_graphql( + GRAPHQL_NO_DELETE_BRANCH_LABEL, + {"owner": "pytorch", "repo": "pytorch"}, + lambda data: False, + lambda res: res["data"]["repository"]["label"]["pullRequests"]["nodes"], + lambda res: res["data"]["repository"]["label"]["pullRequests"]["pageInfo"], + ) + + pr_infos.extend( + paginate_graphql( + GRAPHQL_OPEN_PRS, + {"owner": "pytorch", "repo": "pytorch"}, + lambda data: False, + lambda res: res["data"]["repository"]["pullRequests"]["nodes"], + lambda res: res["data"]["repository"]["pullRequests"]["pageInfo"], + ) + ) + + # Get the most recent PR for each branch base (group gh together) + branch_bases = set() + for pr in pr_infos: + branch_base_name = pr["headRefName"] + if x := re.match(r"(gh\/.+)\/(head|base|orig)", branch_base_name): + branch_base_name = x.group(1) + branch_bases.add(branch_base_name) + return branch_bases + + +def delete_branch(repo: GitRepo, branch: str) -> None: + repo._run_git("push", "origin", "-d", branch) + + +def delete_branches() -> None: + now = datetime.now().timestamp() + git_repo = GitRepo(str(REPO_ROOT), "origin", debug=True) + branches = get_branches(git_repo) + prs_by_branch = get_recent_prs() + keep_branches = get_branches_with_magic_label_or_open_pr() + + delete = [] + # Do not delete if: + # * associated PR is open, closed but updated recently, or contains the magic string + # * no associated PR and branch was updated in last 1.5 years + # * is protected + # Setting different values of PR_WINDOW will change how branches with closed + # PRs are treated depending on how old the branch is. The default value of + # 90 will allow branches with closed PRs to be deleted if the PR hasn't been + # updated in 90 days and the branch hasn't been updated in 1.5 years + for base_branch, (date, sub_branches) in branches.items(): + print(f"[{base_branch}] Updated {(now - date) / SEC_IN_DAY} days ago") + if base_branch in keep_branches: + print(f"[{base_branch}] Has magic label or open PR, skipping") + continue + pr = prs_by_branch.get(base_branch) + if pr: + print( + f"[{base_branch}] Has PR {pr['number']}: {pr['state']}, updated {(now - pr['updatedAt']) / SEC_IN_DAY} days ago" + ) + if ( + now - pr["updatedAt"] < CLOSED_PR_RETENTION + or (now - date) < CLOSED_PR_RETENTION + ): + continue + elif now - date < NO_PR_RETENTION: + continue + print(f"[{base_branch}] Checking for branch protections") + if any(is_protected(sub_branch) for sub_branch in sub_branches): + print(f"[{base_branch}] Is protected") + continue + for sub_branch in sub_branches: + print(f"[{base_branch}] Deleting {sub_branch}") + delete.append(sub_branch) + if ESTIMATED_TOKENS[0] > 400: + print("Estimated tokens exceeded, exiting") + break + + print(f"To delete ({len(delete)}):") + for branch in delete: + print(f"About to delete branch {branch}") + delete_branch(git_repo, branch) + + +if __name__ == "__main__": + delete_branches() diff --git a/.github/scripts/drci_mocks.json.gz b/.github/scripts/drci_mocks.json.gz index 5e272a1493a4f..a4c1db752cb09 100644 Binary files a/.github/scripts/drci_mocks.json.gz and b/.github/scripts/drci_mocks.json.gz differ diff --git a/.github/scripts/fetch_latest_green_commit.py b/.github/scripts/fetch_latest_green_commit.py deleted file mode 100644 index 1f0cd91233b98..0000000000000 --- a/.github/scripts/fetch_latest_green_commit.py +++ /dev/null @@ -1,139 +0,0 @@ -import os -import re -import sys -from typing import Any, cast, Dict, List, NamedTuple, Tuple - -import rockset # type: ignore[import] -from gitutils import _check_output - - -def 
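The keep/delete decision above is spread across the loop in `delete_branches()`; as a reading aid, here is a minimal, hypothetical distillation of those rules as a standalone helper (`should_delete` does not exist in the script, the thresholds simply restate its module constants, and the `PR_WINDOW` lookback that limits which closed PRs are fetched at all is ignored):

```
from typing import Optional

SEC_IN_DAY = 24 * 60 * 60
CLOSED_PR_RETENTION = 30 * SEC_IN_DAY     # same thresholds as the module constants
NO_PR_RETENTION = 1.5 * 365 * SEC_IN_DAY


def should_delete(
    branch_age_sec: float,
    pr_age_sec: Optional[float],        # age of the most recent associated PR, if any
    has_open_pr_or_magic_label: bool,   # open PR, or a PR carrying "no-delete-branch"
    is_protected: bool,
) -> bool:
    """Mirror of the per-branch decision made inside delete_branches()."""
    if has_open_pr_or_magic_label or is_protected:
        return False
    if pr_age_sec is not None:
        # Closed/merged PR: keep while either the PR or the branch is < 30 days old
        return (
            pr_age_sec >= CLOSED_PR_RETENTION
            and branch_age_sec >= CLOSED_PR_RETENTION
        )
    # No associated PR: keep the branch for roughly 1.5 years after its last update
    return branch_age_sec >= NO_PR_RETENTION
```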
eprint(msg: str) -> None: - print(msg, file=sys.stderr) - - -class WorkflowCheck(NamedTuple): - workflowName: str - name: str - jobName: str - conclusion: str - - -def get_latest_commits() -> List[str]: - latest_viable_commit = _check_output( - [ - "git", - "log", - "-n", - "1", - "--pretty=format:%H", - "origin/viable/strict", - ], - encoding="ascii", - ) - commits = _check_output( - [ - "git", - "rev-list", - f"{latest_viable_commit}^..HEAD", - "--remotes=*origin/main", - ], - encoding="ascii", - ).splitlines() - - return commits - - -def query_commits(commits: List[str]) -> List[Dict[str, Any]]: - rs = rockset.RocksetClient( - host="api.usw2a1.rockset.com", api_key=os.environ["ROCKSET_API_KEY"] - ) - params = [{"name": "shas", "type": "string", "value": ",".join(commits)}] - res = rs.QueryLambdas.execute_query_lambda( - # https://console.rockset.com/lambdas/details/commons.commit_jobs_batch_query - query_lambda="commit_jobs_batch_query", - version="19c74e10819104f9", - workspace="commons", - parameters=params, - ) - - return cast(List[Dict[str, Any]], res.results) - - -def print_commit_status(commit: str, results: Dict[str, Any]) -> None: - print(commit) - for check in results["results"]: - if check["sha"] == commit: - print(f"\t{check['conclusion']:>10}: {check['name']}") - - -def get_commit_results( - commit: str, results: List[Dict[str, Any]] -) -> List[Dict[str, Any]]: - workflow_checks = [] - for check in results: - if check["sha"] == commit: - workflow_checks.append( - WorkflowCheck( - workflowName=check["workflowName"], - name=check["name"], - jobName=check["jobName"], - conclusion=check["conclusion"], - )._asdict() - ) - return workflow_checks - - -def isGreen(commit: str, results: List[Dict[str, Any]]) -> Tuple[bool, str]: - workflow_checks = get_commit_results(commit, results) - - regex = { - "pull": False, - "trunk": False, - "lint": False, - "linux-binary": False, - } - - for check in workflow_checks: - jobName = check["jobName"] - # Ignore result from unstable job, be it success or failure - if "unstable" in jobName: - continue - - workflowName = check["workflowName"] - conclusion = check["conclusion"] - for required_check in regex: - if re.match(required_check, workflowName, flags=re.IGNORECASE): - if conclusion not in ["success", "skipped"]: - return (False, workflowName + " checks were not successful") - else: - regex[required_check] = True - - missing_workflows = [x for x in regex.keys() if not regex[x]] - if len(missing_workflows) > 0: - return (False, "missing required workflows: " + ", ".join(missing_workflows)) - - return (True, "") - - -def get_latest_green_commit(commits: List[str], results: List[Dict[str, Any]]) -> Any: - for commit in commits: - eprint(f"Checking {commit}") - is_green, msg = isGreen(commit, results) - if is_green: - eprint("GREEN") - return commit - else: - eprint("RED: " + msg) - return None - - -def main() -> None: - commits = get_latest_commits() - results = query_commits(commits) - - latest_viable_commit = get_latest_green_commit(commits, results) - print(latest_viable_commit) - - -if __name__ == "__main__": - main() diff --git a/.github/scripts/filter_test_configs.py b/.github/scripts/filter_test_configs.py index c25b0f6fe84d0..c2e45bac81100 100755 --- a/.github/scripts/filter_test_configs.py +++ b/.github/scripts/filter_test_configs.py @@ -1,6 +1,7 @@ #!/usr/bin/env python3 import json +import logging import os import re import subprocess @@ -8,6 +9,7 @@ import warnings from enum import Enum from functools import lru_cache +from logging 
import info from typing import Any, Callable, Dict, List, Optional, Set from urllib.request import Request, urlopen @@ -17,33 +19,7 @@ PREFIX = "test-config/" -# Same as shard names -VALID_TEST_CONFIG_LABELS = { - f"{PREFIX}{label}" - for label in { - "backwards_compat", - "crossref", - "default", - "deploy", - "distributed", - "docs_tests", - "dynamo", - "force_on_cpu", - "functorch", - "inductor", - "inductor_distributed", - "inductor_huggingface", - "inductor_timm", - "inductor_torchbench", - "jit_legacy", - "multigpu", - "nogpu_AVX512", - "nogpu_NO_AVX2", - "slow", - "tsan", - "xla", - } -} +logging.basicConfig(level=logging.INFO) def is_cuda_or_rocm_job(job_name: Optional[str]) -> bool: @@ -90,6 +66,12 @@ def parse_args() -> Any: parser.add_argument( "--test-matrix", type=str, required=True, help="the original test matrix" ) + parser.add_argument( + "--selected-test-configs", + type=str, + default="", + help="a comma-separated list of test configurations from the test matrix to keep", + ) parser.add_argument( "--workflow", type=str, help="the name of the current workflow, i.e. pull" ) @@ -155,19 +137,25 @@ def get_labels(pr_number: int) -> Set[str]: } +def filter_labels(labels: Set[str], label_regex: Any) -> Set[str]: + """ + Return the list of matching labels + """ + return {l for l in labels if re.match(label_regex, l)} + + def filter(test_matrix: Dict[str, List[Any]], labels: Set[str]) -> Dict[str, List[Any]]: """ Select the list of test config to run from the test matrix. The logic works as follows: - If the PR has one or more labels as specified in the VALID_TEST_CONFIG_LABELS set, only - these test configs will be selected. This also works with ciflow labels, for example, - if a PR has both ciflow/trunk and test-config/functorch, only trunk functorch builds - and tests will be run + If the PR has one or more test-config labels as specified, only these test configs + will be selected. This also works with ciflow labels, for example, if a PR has both + ciflow/trunk and test-config/functorch, only trunk functorch builds and tests will + be run. If the PR has none of the test-config label, all tests are run as usual. 
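To make the rule above concrete, here is a small standalone sketch of the selection logic, not the CI code itself (`select_configs` and the sample matrix are made up for illustration): any `test-config/<name>` label on the PR selects the matching entry of the test matrix, and with no such label the matrix passes through unchanged.

```
import re
from typing import Any, Dict, List, Set

PREFIX = "test-config/"


def select_configs(
    test_matrix: Dict[str, List[Any]], labels: Set[str]
) -> Dict[str, List[Any]]:
    # Keep an entry when the PR carries the matching test-config/<config> label
    selected = [
        entry
        for entry in test_matrix.get("include", [])
        if f"{PREFIX}{entry.get('config', '').strip()}" in labels
    ]
    # With no test-config/* label at all, run everything as usual
    if not any(re.match(f"{PREFIX}.+", label) for label in labels):
        return test_matrix
    return {"include": selected}


matrix = {"include": [{"config": "default"}, {"config": "functorch"}]}
print(select_configs(matrix, {"ciflow/trunk", "test-config/functorch"}))
# {'include': [{'config': 'functorch'}]}
```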
""" - filtered_test_matrix: Dict[str, List[Any]] = {"include": []} for entry in test_matrix.get("include", []): @@ -177,23 +165,46 @@ def filter(test_matrix: Dict[str, List[Any]], labels: Set[str]) -> Dict[str, Lis label = f"{PREFIX}{config_name.strip()}" if label in labels: - print( - f"Select {config_name} because label {label} is presented in the pull request by the time the test starts" - ) + msg = f"Select {config_name} because label {label} is present in the pull request by the time the test starts" + info(msg) filtered_test_matrix["include"].append(entry) - valid_test_config_labels = labels.intersection(VALID_TEST_CONFIG_LABELS) - - if not filtered_test_matrix["include"] and not valid_test_config_labels: - # Found no valid label and the filtered test matrix is empty, return the same + test_config_labels = filter_labels(labels, re.compile(f"{PREFIX}.+")) + if not filtered_test_matrix["include"] and not test_config_labels: + info("Found no test-config label on the PR, so all test configs are included") + # Found no test-config label and the filtered test matrix is empty, return the same # test matrix as before so that all tests can be run normally return test_matrix else: + msg = f"Found {test_config_labels} on the PR so only these test configs are run" + info(msg) # When the filter test matrix contain matches or if a valid test config label # is found in the PR, return the filtered test matrix return filtered_test_matrix +def filter_selected_test_configs( + test_matrix: Dict[str, List[Any]], selected_test_configs: Set[str] +) -> Dict[str, List[Any]]: + """ + Keep only the selected configs if the list if not empty. Otherwise, keep all test configs. + This filter is used when the workflow is dispatched manually. + """ + if not selected_test_configs: + return test_matrix + + filtered_test_matrix: Dict[str, List[Any]] = {"include": []} + for entry in test_matrix.get("include", []): + config_name = entry.get("config", "") + if not config_name: + continue + + if config_name in selected_test_configs: + filtered_test_matrix["include"].append(entry) + + return filtered_test_matrix + + def set_periodic_modes( test_matrix: Dict[str, List[Any]], job_name: Optional[str] ) -> Dict[str, List[Any]]: @@ -374,30 +385,33 @@ def process_jobs( # - If the target record has the job (config) name, only that test config # will be skipped or marked as unstable if not target_job_cfg: - print( + msg = ( f"Issue {target_url} created by {author} has {issue_type.value} " + f"all CI jobs for {workflow} / {job_name}" ) + info(msg) return _filter_jobs( test_matrix=test_matrix, issue_type=issue_type, ) if target_job_cfg == BUILD_JOB_NAME: - print( + msg = ( f"Issue {target_url} created by {author} has {issue_type.value} " + f"the build job for {workflow} / {job_name}" ) + info(msg) return _filter_jobs( test_matrix=test_matrix, issue_type=issue_type, ) if target_job_cfg in (TEST_JOB_NAME, BUILD_AND_TEST_JOB_NAME): - print( + msg = ( f"Issue {target_url} created by {author} has {issue_type.value} " + f"all the test jobs for {workflow} / {job_name}" ) + info(msg) return _filter_jobs( test_matrix=test_matrix, issue_type=issue_type, @@ -463,7 +477,7 @@ def parse_reenabled_issues(s: Optional[str]) -> List[str]: def get_reenabled_issues(pr_body: str = "") -> List[str]: - default_branch = os.getenv("GIT_DEFAULT_BRANCH", "main") + default_branch = f"origin/{os.environ.get('GIT_DEFAULT_BRANCH', 'main')}" try: commit_messages = subprocess.check_output( f"git cherry -v {default_branch}".split(" ") @@ -474,6 +488,10 @@ def 
get_reenabled_issues(pr_body: str = "") -> List[str]: return parse_reenabled_issues(pr_body) + parse_reenabled_issues(commit_messages) +def check_for_setting(labels: Set[str], body: str, setting: str) -> bool: + return setting in labels or f"[{setting}]" in body + + def perform_misc_tasks( labels: Set[str], test_matrix: Dict[str, List[Any]], job_name: str, pr_body: str ) -> None: @@ -481,11 +499,24 @@ def perform_misc_tasks( In addition to apply the filter logic, the script also does the following misc tasks to set keep-going and is-unstable variables """ - set_output("keep-going", "keep-going" in labels) + set_output("keep-going", check_for_setting(labels, pr_body, "keep-going")) + set_output( + "ci-verbose-test-logs", + check_for_setting(labels, pr_body, "ci-verbose-test-logs"), + ) + set_output( + "ci-no-test-timeout", check_for_setting(labels, pr_body, "ci-no-test-timeout") + ) + set_output("ci-no-td", check_for_setting(labels, pr_body, "ci-no-td")) + # Only relevant for the one linux distributed cuda job, delete this when TD + # is rolled out completely + set_output( + "ci-td-distributed", check_for_setting(labels, pr_body, "ci-td-distributed") + ) # Obviously, if the job name includes unstable, then this is an unstable job is_unstable = job_name and IssueType.UNSTABLE.value in job_name - if not is_unstable and test_matrix: + if not is_unstable and test_matrix and test_matrix.get("include"): # Even when the job name doesn't mention unstable, we will also mark it as # unstable when the test matrix only includes unstable jobs. Basically, this # logic allows build or build-and-test jobs to be marked as unstable too. @@ -555,6 +586,16 @@ def main() -> None: # No PR number, no tag, we can just return the test matrix as it is filtered_test_matrix = test_matrix + if args.selected_test_configs: + selected_test_configs = { + v.strip().lower() + for v in args.selected_test_configs.split(",") + if v.strip() + } + filtered_test_matrix = filter_selected_test_configs( + filtered_test_matrix, selected_test_configs + ) + if args.event_name == "schedule" and args.schedule == "29 8 * * *": # we don't want to run the mem leak check or disabled tests on normal # periodically scheduled jobs, only the ones at this time @@ -577,7 +618,7 @@ def main() -> None: labels=labels, test_matrix=filtered_test_matrix, job_name=args.job_name, - pr_body=pr_body, + pr_body=pr_body if pr_body else "", ) # Set the filtered test matrix as the output diff --git a/.github/scripts/generate_binary_build_matrix.py b/.github/scripts/generate_binary_build_matrix.py index dcfa328c1875d..1e8bd57d44ac6 100644 --- a/.github/scripts/generate_binary_build_matrix.py +++ b/.github/scripts/generate_binary_build_matrix.py @@ -13,16 +13,16 @@ import os from typing import Dict, List, Optional, Tuple -CUDA_ARCHES = ["11.8", "12.1"] +CUDA_ARCHES = ["11.8", "12.1", "12.4"] -CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1"} +CUDA_ARCHES_FULL_VERSION = {"11.8": "11.8.0", "12.1": "12.1.1", "12.4": "12.4.0"} -CUDA_ARCHES_CUDNN_VERSION = {"11.8": "8", "12.1": "8"} +CUDA_ARCHES_CUDNN_VERSION = {"11.8": "8", "12.1": "8", "12.4": "8"} -ROCM_ARCHES = ["5.6", "5.7"] +ROCM_ARCHES = ["6.0", "6.1"] CPU_CXX11_ABI_ARCH = ["cpu-cxx11-abi"] @@ -42,7 +42,7 @@ "nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | " - 
"nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64'" ), "12.1": ( @@ -55,9 +55,23 @@ "nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | " - "nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " "nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'" ), + "12.4": ( + "nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | " + "nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'" + ), } @@ -324,7 +338,7 @@ def generate_wheels_matrix( ) # 12.1 linux wheels require PYTORCH_EXTRA_INSTALL_REQUIREMENTS to install - if arch_version in ["12.1", "11.8"] and os == "linux": + if arch_version in ["12.4", "12.1", "11.8"] and os == "linux": ret.append( { "python_version": python_version, @@ -367,5 +381,6 @@ def generate_wheels_matrix( return ret +validate_nccl_dep_consistency("12.4") validate_nccl_dep_consistency("12.1") validate_nccl_dep_consistency("11.8") diff --git a/.github/scripts/generate_ci_workflows.py b/.github/scripts/generate_ci_workflows.py index 1075db4255ed0..5b2b473d2a597 100755 --- a/.github/scripts/generate_ci_workflows.py +++ b/.github/scripts/generate_ci_workflows.py @@ -274,42 +274,6 @@ class OperatingSystem: ] MACOS_BINARY_BUILD_WORKFLOWS = [ - BinaryBuildWorkflow( - os=OperatingSystem.MACOS, - package_type="wheel", - build_configs=generate_binary_build_matrix.generate_wheels_matrix( - OperatingSystem.MACOS - ), - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_WHEEL}, - isolated_workflow=True, - ), - ), - BinaryBuildWorkflow( - os=OperatingSystem.MACOS, - package_type="conda", - build_configs=generate_binary_build_matrix.generate_conda_matrix( - OperatingSystem.MACOS - ), - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_CONDA}, - isolated_workflow=True, - ), - ), - BinaryBuildWorkflow( - os=OperatingSystem.MACOS, - package_type="libtorch", - 
abi_version=generate_binary_build_matrix.CXX11_ABI, - build_configs=generate_binary_build_matrix.generate_libtorch_matrix( - OperatingSystem.MACOS, - generate_binary_build_matrix.CXX11_ABI, - libtorch_variants=["shared-with-deps"], - ), - ciflow_config=CIFlowConfig( - labels={LABEL_CIFLOW_BINARIES, LABEL_CIFLOW_BINARIES_LIBTORCH}, - isolated_workflow=True, - ), - ), BinaryBuildWorkflow( os=OperatingSystem.MACOS_ARM64, package_type="libtorch", @@ -342,7 +306,8 @@ class OperatingSystem: BinaryBuildWorkflow( os=OperatingSystem.MACOS_ARM64, package_type="conda", - cross_compile_arm64=True, + cross_compile_arm64=False, + macos_runner="macos-13-xlarge", build_configs=generate_binary_build_matrix.generate_conda_matrix( OperatingSystem.MACOS_ARM64 ), @@ -413,7 +378,9 @@ def main() -> None: for template, workflows in template_and_workflows: # added Iterable check to appease the mypy gods if not isinstance(workflows, Iterable): - raise Exception(f"How is workflows not iterable? {workflows}") + raise Exception( # noqa: TRY002 + f"How is workflows not iterable? {workflows}" + ) # noqa: TRY002 for workflow in workflows: workflow.generate_workflow_file(workflow_template=template) diff --git a/.github/scripts/generate_docker_release_matrix.py b/.github/scripts/generate_docker_release_matrix.py index 4dfa016be86cb..49d567ceadf8d 100644 --- a/.github/scripts/generate_docker_release_matrix.py +++ b/.github/scripts/generate_docker_release_matrix.py @@ -4,7 +4,7 @@ Will output a condensed version of the matrix. Will include fllowing: * CUDA version short - * CUDA full verison + * CUDA full version * CUDNN version short * Image type either runtime or devel * Platform linux/arm64,linux/amd64 @@ -21,6 +21,8 @@ def generate_docker_matrix() -> Dict[str, List[Dict[str, str]]]: ret: List[Dict[str, str]] = [] + # CUDA amd64 Docker images are available as both runtime and devel while + # CPU arm64 image is only available as runtime. for cuda, version in generate_binary_build_matrix.CUDA_ARCHES_FULL_VERSION.items(): for image in DOCKER_IMAGE_TYPES: ret.append( @@ -31,9 +33,19 @@ def generate_docker_matrix() -> Dict[str, List[Dict[str, str]]]: cuda ], "image_type": image, - "platform": "linux/arm64,linux/amd64", + "platform": "linux/amd64", } ) + ret.append( + { + "cuda": "cpu", + "cuda_full_version": "", + "cudnn_version": "", + "image_type": "runtime", + "platform": "linux/arm64", + } + ) + return {"include": ret} diff --git a/.github/scripts/get_aws_session_tokens.py b/.github/scripts/get_aws_session_tokens.py new file mode 100755 index 0000000000000..81a046f92778b --- /dev/null +++ b/.github/scripts/get_aws_session_tokens.py @@ -0,0 +1,13 @@ +#!/usr/bin/env python3 +import boto3 # type: ignore[import] + + +def main() -> None: + creds_dict = boto3.Session().get_credentials().get_frozen_credentials()._asdict() + print(f"export AWS_ACCESS_KEY_ID={creds_dict['access_key']}") + print(f"export AWS_SECRET_ACCESS_KEY={creds_dict['secret_key']}") + print(f"export AWS_SESSION_TOKEN={creds_dict['token']}") + + +if __name__ == "__main__": + main() diff --git a/.github/scripts/get_workflow_job_id.py b/.github/scripts/get_workflow_job_id.py index 75bc7e016175b..28f337a30710a 100644 --- a/.github/scripts/get_workflow_job_id.py +++ b/.github/scripts/get_workflow_job_id.py @@ -4,6 +4,7 @@ import argparse import json +import operator import os import re import sys @@ -126,7 +127,7 @@ def find_job_id_name(args: Any) -> Tuple[str, str]: # Sort the jobs list by start time, in descending order. 
We want to get the most # recently scheduled job on the runner. - jobs.sort(key=lambda job: job["started_at"], reverse=True) + jobs.sort(key=operator.itemgetter("started_at"), reverse=True) for job in jobs: if job["runner_name"] == args.runner_name: diff --git a/.github/scripts/github_utils.py b/.github/scripts/github_utils.py index 05b95fc916646..d76d32f624d8a 100644 --- a/.github/scripts/github_utils.py +++ b/.github/scripts/github_utils.py @@ -119,6 +119,19 @@ def gh_fetch_json_dict( return cast(Dict[str, Any], _gh_fetch_json_any(url, params, data)) +def gh_graphql(query: str, **kwargs: Any) -> Dict[str, Any]: + rc = gh_fetch_url( + "https://api.github.com/graphql", + data={"query": query, "variables": kwargs}, + reader=json.load, + ) + if "errors" in rc: + raise RuntimeError( + f"GraphQL query {query}, args {kwargs} failed: {rc['errors']}" + ) + return cast(Dict[str, Any], rc) + + def _gh_post_comment( url: str, comment: str, dry_run: bool = False ) -> List[Dict[str, Any]]: diff --git a/.github/scripts/gitutils.py b/.github/scripts/gitutils.py index 280fa991f9466..1640e4354f90d 100644 --- a/.github/scripts/gitutils.py +++ b/.github/scripts/gitutils.py @@ -155,12 +155,19 @@ def branches_containing_ref( ) return [x.strip() for x in rc.split("\n") if x.strip()] if len(rc) > 0 else [] - def current_branch(self) -> str: - return self._run_git("symbolic-ref", "--short", "HEAD").strip() + def current_branch(self) -> Optional[str]: + try: + return self._run_git("symbolic-ref", "--short", "HEAD").strip() + except RuntimeError: + # we are in detached HEAD state + return None def checkout(self, branch: str) -> None: self._run_git("checkout", branch) + def create_branch_and_checkout(self, branch: str) -> None: + self._run_git("checkout", "-b", branch) + def fetch(self, ref: Optional[str] = None, branch: Optional[str] = None) -> None: if branch is None and ref is None: self._run_git("fetch", self.remote) @@ -273,6 +280,7 @@ def compute_branch_diffs( def cherry_pick_commits(self, from_branch: str, to_branch: str) -> None: orig_branch = self.current_branch() + assert orig_branch is not None, "Must be on a branch" self.checkout(to_branch) from_commits, to_commits = self.compute_branch_diffs(from_branch, to_branch) if len(from_commits) == 0: diff --git a/.github/scripts/gql_mocks.json.gz b/.github/scripts/gql_mocks.json.gz index d11489b339031..31a5230dbae9a 100644 Binary files a/.github/scripts/gql_mocks.json.gz and b/.github/scripts/gql_mocks.json.gz differ diff --git a/.github/scripts/label_utils.py b/.github/scripts/label_utils.py index 64128c065c663..eeb82ec316081 100644 --- a/.github/scripts/label_utils.py +++ b/.github/scripts/label_utils.py @@ -74,15 +74,23 @@ def gh_get_labels(org: str, repo: str) -> List[str]: def gh_add_labels( - org: str, repo: str, pr_num: int, labels: Union[str, List[str]] + org: str, repo: str, pr_num: int, labels: Union[str, List[str]], dry_run: bool ) -> None: + if dry_run: + print(f"Dryrun: Adding labels {labels} to PR {pr_num}") + return gh_fetch_url_and_headers( url=f"https://api.github.com/repos/{org}/{repo}/issues/{pr_num}/labels", data={"labels": labels}, ) -def gh_remove_label(org: str, repo: str, pr_num: int, label: str) -> None: +def gh_remove_label( + org: str, repo: str, pr_num: int, label: str, dry_run: bool +) -> None: + if dry_run: + print(f"Dryrun: Removing {label} from PR {pr_num}") + return gh_fetch_url_and_headers( url=f"https://api.github.com/repos/{org}/{repo}/issues/{pr_num}/labels/{label}", method="DELETE", diff --git 
a/.github/scripts/lintrunner.sh b/.github/scripts/lintrunner.sh new file mode 100755 index 0000000000000..82f472b0f16b7 --- /dev/null +++ b/.github/scripts/lintrunner.sh @@ -0,0 +1,47 @@ +#!/usr/bin/env bash +set -ex + +# The generic Linux job chooses to use base env, not the one setup by the image +CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") +eval "$(command conda 'shell.bash' 'hook' 2> /dev/null)" +conda activate "${CONDA_ENV}" + +# Use uv to speed up lintrunner init +python3 -m pip install uv + +CACHE_DIRECTORY="/tmp/.lintbin" +# Try to recover the cached binaries +if [[ -d "${CACHE_DIRECTORY}" ]]; then + # It's ok to fail this as lintrunner init would download these binaries + # again if they do not exist + cp -r "${CACHE_DIRECTORY}" . || true +fi + +# This has already been cached in the docker image +lintrunner init 2> /dev/null + +# Do build steps necessary for linters +if [[ "${CLANG}" == "1" ]]; then + python3 -m tools.linter.clang_tidy.generate_build_files +fi +python3 -m tools.generate_torch_version --is_debug=false +python3 -m tools.pyi.gen_pyi \ + --native-functions-path aten/src/ATen/native/native_functions.yaml \ + --tags-path aten/src/ATen/native/tags.yaml \ + --deprecated-functions-path "tools/autograd/deprecated.yaml" + +RC=0 +# Run lintrunner on all files +if ! lintrunner --force-color --all-files --tee-json=lint.json ${ADDITIONAL_LINTRUNNER_ARGS} 2> /dev/null; then + echo "" + echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner -m origin/main\`. (If you don't get the same results, run \'lintrunner init\' to update your local linter)\e[0m" + echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m" + RC=1 +fi + +# Use jq to massage the JSON lint output into GitHub Actions workflow commands. +jq --raw-output \ + '"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \ + lint.json || true + +exit $RC diff --git a/.github/scripts/s390x-ci/README.md b/.github/scripts/s390x-ci/README.md new file mode 100644 index 0000000000000..f62b02e24aa3e --- /dev/null +++ b/.github/scripts/s390x-ci/README.md @@ -0,0 +1,51 @@ +# Configuring the builder. + +## Install prerequisites. + +``` +$ sudo dnf install docker +``` + +## Add services. + +``` +$ sudo cp self-hosted-builder/*.service /etc/systemd/system/ +$ sudo systemctl daemon-reload +``` + +## Download qemu-user-static image + +``` +# sudo docker pull docker.io/iiilinuxibmcom/qemu-user-static:6.1.0-1 +``` + +## Autostart the x86_64 emulation support. + +``` +$ sudo systemctl enable --now qemu-user-static +``` + +## Rebuild the image + +In order to build or update the `iiilinuxibmcom/actions-runner` image, e.g. to get the +latest OS security fixes, use the following commands: + +``` +$ cd self-hosted-builder +$ sudo docker build \ + --build-arg repo=/ \ + --build-arg token=<***> \ + --pull \ + -f actions-runner.Dockerfile \ + -t iiilinuxibmcom/actions-runner \ + . +``` + +If it fails, ensure that selinux doesn't prevent it from working. +In worst case, selinux can be disabled with `setenforce 0`. + +## Autostart the runner. 
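For readers who do not speak jq: the filter in `lintrunner.sh` above turns each JSON record emitted by `lintrunner --tee-json` into a GitHub Actions workflow command. A rough Python equivalent is sketched below; the record is made up, and its field names are inferred from the jq program rather than from a documented schema.

```
import json

# One record of the lint.json stream (shape assumed for illustration only)
record = json.loads(
    '{"path": "torch/foo.py", "line": 10, "char": 4, "code": "FLAKE8",'
    ' "name": "E501", "severity": "advice", "description": "line too long\\nplease wrap"}'
)

severity = "warning" if record["severity"] in ("advice", "disabled") else record["severity"]
description = record["description"].replace("\n", "%0A")  # workflow commands are one line
print(
    f"::{severity} file={record['path']},line={record['line']},"
    f"col={record['char']},title={record['code']} {record['name']}::{description}"
)
# ::warning file=torch/foo.py,line=10,col=4,title=FLAKE8 E501::line too long%0Aplease wrap
```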
+ +``` +$ sudo systemctl enable --now actions-runner@$NAME +``` diff --git a/.github/scripts/s390x-ci/self-hosted-builder/actions-runner.Dockerfile b/.github/scripts/s390x-ci/self-hosted-builder/actions-runner.Dockerfile new file mode 100644 index 0000000000000..416a6d8e50df5 --- /dev/null +++ b/.github/scripts/s390x-ci/self-hosted-builder/actions-runner.Dockerfile @@ -0,0 +1,66 @@ +# Self-Hosted IBM Z Github Actions Runner. + +# Temporary image: amd64 dependencies. +FROM docker.io/amd64/ubuntu:22.04 as ld-prefix +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && apt-get -y install ca-certificates libicu70 libssl3 + +# Main image. +FROM docker.io/s390x/ubuntu:22.04 + +# Packages for pytorch building and testing. +ENV DEBIAN_FRONTEND=noninteractive +RUN apt-get update && apt-get -y install \ + cmake \ + curl \ + gcc \ + git \ + jq \ + libxml2-dev \ + libxslt-dev \ + ninja-build \ + python-is-python3 \ + python3 \ + python3-dev \ + python3-pip \ + pybind11-dev \ + python3-numpy \ + libopenblas-dev \ + liblapack-dev \ + libgloo-dev \ + python3-yaml \ + python3-scipy \ + virtualenv + +# amd64 dependencies. +COPY --from=ld-prefix / /usr/x86_64-linux-gnu/ +RUN ln -fs ../lib/x86_64-linux-gnu/ld-linux-x86-64.so.2 /usr/x86_64-linux-gnu/lib64/ +RUN ln -fs /etc/resolv.conf /usr/x86_64-linux-gnu/etc/ +ENV QEMU_LD_PREFIX=/usr/x86_64-linux-gnu + +# Scripts. +COPY fs/ / + +RUN chmod +x /usr/bin/actions-runner /usr/bin/entrypoint + +# amd64 Github Actions Runner. +RUN useradd -m actions-runner +USER actions-runner +WORKDIR /home/actions-runner +RUN curl -L https://github.com/actions/runner/releases/download/v2.309.0/actions-runner-linux-x64-2.309.0.tar.gz | tar -xz + +# repository +ARG repo + +# repository token +ARG token + +RUN ./config.sh \ + --unattended \ + --url "https://github.com/${repo}" \ + --token "${token}" \ + --no-default-labels \ + --labels self-hosted,linux.s390x + +ENTRYPOINT ["/usr/bin/entrypoint"] +CMD ["/usr/bin/actions-runner"] diff --git a/.github/scripts/s390x-ci/self-hosted-builder/actions-runner@.service b/.github/scripts/s390x-ci/self-hosted-builder/actions-runner@.service new file mode 100644 index 0000000000000..158be9ccb6c1d --- /dev/null +++ b/.github/scripts/s390x-ci/self-hosted-builder/actions-runner@.service @@ -0,0 +1,22 @@ +[Unit] +Description=Self-Hosted IBM Z Github Actions Runner +Wants=qemu-user-static +After=qemu-user-static +StartLimitIntervalSec=0 + +[Service] +Type=simple +Restart=always +ExecStartPre=-/usr/bin/docker rm --force actions-runner.%i +ExecStart=/usr/bin/docker run \ + --init \ + --interactive \ + --name=actions-runner.%i \ + --rm \ + iiilinuxibmcom/actions-runner +ExecStop=/bin/sh -c "docker exec actions-runner.%i kill -INT -- -1" +ExecStop=/bin/sh -c "docker wait actions-runner.%i" +ExecStop=/bin/sh -c "docker rm actions-runner.%i" + +[Install] +WantedBy=multi-user.target diff --git a/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/actions-runner b/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/actions-runner new file mode 100644 index 0000000000000..760784b21c396 --- /dev/null +++ b/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/actions-runner @@ -0,0 +1,6 @@ +#!/usr/bin/env bash + +set -e -u + +# Run one job. 
+./run.sh --once diff --git a/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/entrypoint b/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/entrypoint new file mode 100644 index 0000000000000..14f6c84ca602e --- /dev/null +++ b/.github/scripts/s390x-ci/self-hosted-builder/fs/usr/bin/entrypoint @@ -0,0 +1,30 @@ +#!/usr/bin/env bash + +# +# Container entrypoint that waits for all spawned processes. +# + +set -e -u + +# Create a FIFO and start reading from its read end. +tempdir=$(mktemp -d "/tmp/done.XXXXXXXXXX") +trap 'rm -r "$tempdir"' EXIT +done="$tempdir/pipe" +mkfifo "$done" +cat "$done" & waiter=$! + +# Start the workload. Its descendants will inherit the FIFO's write end. +status=0 +if [ "$#" -eq 0 ]; then + bash 9>"$done" || status=$? +else + "$@" 9>"$done" || status=$? +fi + +# When the workload and all of its descendants exit, the FIFO's write end will +# be closed and `cat "$done"` will exit. Wait until it happens. This is needed +# in order to handle SelfUpdater, which the workload may start in background +# before exiting. +wait "$waiter" + +exit "$status" diff --git a/.github/scripts/s390x-ci/self-hosted-builder/qemu-user-static.service b/.github/scripts/s390x-ci/self-hosted-builder/qemu-user-static.service new file mode 100644 index 0000000000000..40b6c5b17f3ea --- /dev/null +++ b/.github/scripts/s390x-ci/self-hosted-builder/qemu-user-static.service @@ -0,0 +1,11 @@ +[Unit] +Description=Support for transparent execution of non-native binaries with QEMU user emulation + +[Service] +Type=oneshot +# The source code for iiilinuxibmcom/qemu-user-static is at https://github.com/iii-i/qemu-user-static/tree/v6.1.0-1 +# TODO: replace it with multiarch/qemu-user-static once version >6.1 is available +ExecStart=/usr/bin/docker run --rm --interactive --privileged docker.io/iiilinuxibmcom/qemu-user-static:6.1.0-1 --reset -p yes + +[Install] +WantedBy=multi-user.target diff --git a/.github/scripts/td_llm_indexer.sh b/.github/scripts/td_llm_indexer.sh new file mode 100644 index 0000000000000..97565b5db0386 --- /dev/null +++ b/.github/scripts/td_llm_indexer.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +set -euxo pipefail + +# Download requirements +cd llm-target-determinator +pip install -q -r requirements.txt +cd ../codellama +pip install -e . 
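The FIFO trick in the s390x container entrypoint above is subtle enough to deserve a tiny demonstration. The sketch below is hypothetical and not part of the CI; it shows the same mechanism in Python, where the two-second `sleep` stands in for a background process such as the runner's self-updater: the parent reads a FIFO whose write end is inherited by every descendant, so EOF — and therefore the final wait — only happens once all of them have exited.

```
import os
import subprocess
import tempfile

tmpdir = tempfile.mkdtemp()
fifo = os.path.join(tmpdir, "done")
os.mkfifo(fifo)

reader = subprocess.Popen(["cat", fifo])  # exits only once every write end is closed
wfd = os.open(fifo, os.O_WRONLY)          # write end handed down to the workload

# The direct child exits immediately, but its backgrounded grandchild keeps the
# inherited write end open for two more seconds.
subprocess.run(["bash", "-c", "(sleep 2) & exit 0"], pass_fds=(wfd,))

os.close(wfd)   # drop the parent's own copy of the write end
reader.wait()   # returns only after the grandchild has exited
print("all descendants have exited")

os.remove(fifo)
os.rmdir(tmpdir)
```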
+ +# Run indexer +cd ../llm-target-determinator + +torchrun \ + --standalone \ + --nnodes=1 \ + --nproc-per-node=1 \ + indexer.py \ + --experiment-name indexer-files \ + --granularity FILE diff --git a/.github/scripts/test_fetch_latest_green_commit.py b/.github/scripts/test_fetch_latest_green_commit.py deleted file mode 100644 index 0888d0556d12e..0000000000000 --- a/.github/scripts/test_fetch_latest_green_commit.py +++ /dev/null @@ -1,148 +0,0 @@ -from typing import Any, Dict, List -from unittest import main, mock, TestCase - -from fetch_latest_green_commit import isGreen, WorkflowCheck - -workflowNames = [ - "pull", - "trunk", - "Lint", - "linux-binary-libtorch-pre-cxx11", - "android-tests", - "windows-binary-wheel", - "periodic", - "docker-release-builds", - "nightly", - "pr-labels", - "Close stale pull requests", - "Update S3 HTML indices for download.pytorch.org", - "Create Release", -] - - -def set_workflow_job_status( - workflow: List[Dict[str, Any]], name: str, status: str -) -> List[Dict[str, Any]]: - for check in workflow: - if check["workflowName"] == name: - check["conclusion"] = status - return workflow - - -class TestChecks: - def make_test_checks(self) -> List[Dict[str, Any]]: - workflow_checks = [] - for i in range(len(workflowNames)): - workflow_checks.append( - WorkflowCheck( - workflowName=workflowNames[i], - name="test/job", - jobName="job", - conclusion="success", - )._asdict() - ) - return workflow_checks - - -class TestPrintCommits(TestCase): - @mock.patch( - "fetch_latest_green_commit.get_commit_results", - return_value=TestChecks().make_test_checks(), - ) - def test_all_successful(self, mock_get_commit_results: Any) -> None: - "Test with workflows are successful" - workflow_checks = mock_get_commit_results() - self.assertTrue(isGreen("sha", workflow_checks)[0]) - - @mock.patch( - "fetch_latest_green_commit.get_commit_results", - return_value=TestChecks().make_test_checks(), - ) - def test_necessary_successful(self, mock_get_commit_results: Any) -> None: - "Test with necessary workflows are successful" - workflow_checks = mock_get_commit_results() - workflow_checks = set_workflow_job_status( - workflow_checks, workflowNames[8], "failed" - ) - workflow_checks = set_workflow_job_status( - workflow_checks, workflowNames[9], "failed" - ) - workflow_checks = set_workflow_job_status( - workflow_checks, workflowNames[10], "failed" - ) - workflow_checks = set_workflow_job_status( - workflow_checks, workflowNames[11], "failed" - ) - workflow_checks = set_workflow_job_status( - workflow_checks, workflowNames[12], "failed" - ) - self.assertTrue(isGreen("sha", workflow_checks)[0]) - - @mock.patch( - "fetch_latest_green_commit.get_commit_results", - return_value=TestChecks().make_test_checks(), - ) - def test_necessary_skipped(self, mock_get_commit_results: Any) -> None: - "Test with necessary job (ex: pull) skipped" - workflow_checks = mock_get_commit_results() - workflow_checks = set_workflow_job_status(workflow_checks, "pull", "skipped") - result = isGreen("sha", workflow_checks) - self.assertTrue(result[0]) - - @mock.patch( - "fetch_latest_green_commit.get_commit_results", - return_value=TestChecks().make_test_checks(), - ) - def test_skippable_skipped(self, mock_get_commit_results: Any) -> None: - "Test with skippable jobs (periodic and docker-release-builds skipped" - workflow_checks = mock_get_commit_results() - workflow_checks = set_workflow_job_status( - workflow_checks, "periodic", "skipped" - ) - workflow_checks = set_workflow_job_status( - workflow_checks, 
"docker-release-builds", "skipped" - ) - self.assertTrue(isGreen("sha", workflow_checks)) - - @mock.patch( - "fetch_latest_green_commit.get_commit_results", - return_value=TestChecks().make_test_checks(), - ) - def test_necessary_failed(self, mock_get_commit_results: Any) -> None: - "Test with necessary job (ex: Lint) failed" - workflow_checks = mock_get_commit_results() - workflow_checks = set_workflow_job_status(workflow_checks, "Lint", "failed") - result = isGreen("sha", workflow_checks) - self.assertFalse(result[0]) - self.assertEqual(result[1], "Lint checks were not successful") - - @mock.patch( - "fetch_latest_green_commit.get_commit_results", - return_value=TestChecks().make_test_checks(), - ) - def test_skippable_failed(self, mock_get_commit_results: Any) -> None: - "Test with failing skippable jobs (ex: docker-release-builds) should pass" - workflow_checks = mock_get_commit_results() - workflow_checks = set_workflow_job_status( - workflow_checks, "periodic", "skipped" - ) - workflow_checks = set_workflow_job_status( - workflow_checks, "docker-release-builds", "failed" - ) - result = isGreen("sha", workflow_checks) - self.assertTrue(result[0]) - - @mock.patch("fetch_latest_green_commit.get_commit_results", return_value={}) - def test_no_workflows(self, mock_get_commit_results: Any) -> None: - "Test with missing workflows" - workflow_checks = mock_get_commit_results() - result = isGreen("sha", workflow_checks) - self.assertFalse(result[0]) - self.assertEqual( - result[1], - "missing required workflows: pull, trunk, lint, linux-binary", - ) - - -if __name__ == "__main__": - main() diff --git a/.github/scripts/test_filter_test_configs.py b/.github/scripts/test_filter_test_configs.py index 8722ff7a44e9f..2f73d022c3dab 100755 --- a/.github/scripts/test_filter_test_configs.py +++ b/.github/scripts/test_filter_test_configs.py @@ -9,6 +9,7 @@ import yaml from filter_test_configs import ( filter, + filter_selected_test_configs, get_labels, mark_unstable_jobs, parse_reenabled_issues, @@ -17,7 +18,6 @@ remove_disabled_jobs, set_periodic_modes, SUPPORTED_PERIODICAL_MODES, - VALID_TEST_CONFIG_LABELS, ) @@ -273,13 +273,13 @@ def test_filter(self) -> None: testcases = [ { "test_matrix": '{include: [{config: "default", runner: "linux"}]}', - "expected": '{"include": [{"config": "default", "runner": "linux"}]}', - "description": "No match, keep the same test matrix", + "expected": '{"include": []}', + "description": "Request test-config/cfg but the test matrix doesn't have it", }, { "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "plain-cfg"}]}', - "expected": '{"include": [{"config": "default", "runner": "linux"}, {"config": "plain-cfg"}]}', - "description": "No match because there is no prefix or suffix, keep the same test matrix", + "expected": '{"include": []}', + "description": "A valid test config label needs to start with test-config/", }, { "test_matrix": '{include: [{config: "default", runner: "linux"}, {config: "cfg", shard: 1}]}', @@ -294,9 +294,8 @@ def test_filter(self) -> None: ) self.assertEqual(case["expected"], json.dumps(filtered_test_matrix)) - def test_filter_with_valid_label(self) -> None: + def test_filter_with_test_config_label(self) -> None: mocked_labels = {f"{PREFIX}cfg", "ciflow/trunk"} - VALID_TEST_CONFIG_LABELS.add(f"{PREFIX}cfg") testcases = [ { @@ -317,6 +316,51 @@ def test_filter_with_valid_label(self) -> None: ) self.assertEqual(case["expected"], json.dumps(filtered_test_matrix)) + def test_filter_selected_test_configs(self) -> None: + 
testcases = [ + { + "test_matrix": '{include: [{config: "default"}]}', + "selected_test_configs": "", + "expected": '{"include": [{"config": "default"}]}', + "description": "No selected test configs", + }, + { + "test_matrix": '{include: [{config: "default"}]}', + "selected_test_configs": "foo", + "expected": '{"include": []}', + "description": "A different test config is selected", + }, + { + "test_matrix": '{include: [{config: "default"}]}', + "selected_test_configs": "foo, bar", + "expected": '{"include": []}', + "description": "A different set of test configs is selected", + }, + { + "test_matrix": '{include: [{config: "default"}]}', + "selected_test_configs": "foo, bar,default", + "expected": '{"include": [{"config": "default"}]}', + "description": "One of the test config is selected", + }, + { + "test_matrix": '{include: [{config: "default"}, {config: "bar"}]}', + "selected_test_configs": "foo, bar,Default", + "expected": '{"include": [{"config": "default"}, {"config": "bar"}]}', + "description": "Several test configs are selected", + }, + ] + + for case in testcases: + selected_test_configs = { + v.strip().lower() + for v in case["selected_test_configs"].split(",") + if v.strip() + } + filtered_test_matrix = filter_selected_test_configs( + yaml.safe_load(case["test_matrix"]), selected_test_configs + ) + self.assertEqual(case["expected"], json.dumps(filtered_test_matrix)) + def test_set_periodic_modes(self) -> None: testcases: List[Dict[str, str]] = [ { @@ -636,55 +680,110 @@ def test_mark_unstable_jobs(self, mock_download_json: Any) -> None: @mock.patch("subprocess.check_output") def test_perform_misc_tasks(self, mocked_subprocess: Any) -> None: + def _gen_expected_string( + keep_going: bool = False, + ci_verbose_test_logs: bool = False, + ci_no_test_timeout: bool = False, + ci_no_td: bool = False, + ci_td_distributed: bool = False, + is_unstable: bool = False, + reenabled_issues: str = "", + ) -> str: + return ( + f"keep-going={keep_going}\n" + f"ci-verbose-test-logs={ci_verbose_test_logs}\n" + f"ci-no-test-timeout={ci_no_test_timeout}\n" + f"ci-no-td={ci_no_td}\n" + f"ci-td-distributed={ci_td_distributed}\n" + f"is-unstable={is_unstable}\n" + f"reenabled-issues={reenabled_issues}\n" + ) + mocked_subprocess.return_value = b"" testcases: List[Dict[str, Any]] = [ { "labels": {}, "test_matrix": '{include: [{config: "default"}]}', "job_name": "A job name", - "expected": "keep-going=False\nis-unstable=False\nreenabled-issues=\n", + "expected": _gen_expected_string(), "description": "No keep-going, no is-unstable", }, { "labels": {"keep-going"}, "test_matrix": '{include: [{config: "default"}]}', "job_name": "A job name", - "expected": "keep-going=True\nis-unstable=False\nreenabled-issues=\n", + "expected": _gen_expected_string(keep_going=True), "description": "Has keep-going, no is-unstable", }, + { + "labels": {}, + "test_matrix": '{include: [{config: "default"}]}', + "job_name": "A job name", + "pr_body": "[keep-going]", + "expected": _gen_expected_string(keep_going=True), + "description": "Keep-going in PR body", + }, + { + "labels": {"ci-verbose-test-logs"}, + "test_matrix": '{include: [{config: "default"}]}', + "job_name": "A job name", + "pr_body": "[ci-no-test-timeout]", + "expected": _gen_expected_string( + ci_verbose_test_logs=True, ci_no_test_timeout=True + ), + "description": "No pipe logs label and no test timeout in PR body", + }, + { + "labels": {"ci-no-test-timeout"}, + "test_matrix": '{include: [{config: "default"}]}', + "job_name": "A job name", + "pr_body": 
"[ci-verbose-test-logs]", + "expected": _gen_expected_string( + ci_verbose_test_logs=True, ci_no_test_timeout=True + ), + "description": "No pipe logs in PR body and no test timeout in label (same as the above but swapped)", + }, + { + "labels": {"ci-no-td"}, + "test_matrix": '{include: [{config: "default"}]}', + "job_name": "A job name", + "pr_body": "", + "expected": _gen_expected_string(ci_no_td=True), + "description": "No pipe logs in PR body and no test timeout in label (same as the above but swapped)", + }, { "labels": {}, "test_matrix": '{include: [{config: "default"}]}', "job_name": None, - "expected": "keep-going=False\nis-unstable=False\nreenabled-issues=\n", + "expected": _gen_expected_string(), "description": "No job name", }, { "labels": {}, "test_matrix": '{include: [{config: "default"}]}', - "job_name": "macos-12-py3-arm64 / test (default, 1, 3, macos-m1-12, unstable)", - "expected": "keep-going=False\nis-unstable=True\nreenabled-issues=\n", + "job_name": "macos-12-py3-arm64 / test (default, 1, 3, macos-m1-stable, unstable)", + "expected": _gen_expected_string(is_unstable=True), "description": "Unstable job", }, { "labels": {}, "test_matrix": '{include: [{config: "default"}]}', - "job_name": "macos-12-py3-arm64 / test (default, 1, 3, macos-m1-12, unstable)", - "expected": "keep-going=False\nis-unstable=True\nreenabled-issues=\n", + "job_name": "macos-12-py3-arm64 / test (default, 1, 3, macos-m1-stable, unstable)", + "expected": _gen_expected_string(is_unstable=True), "description": "Unstable job", }, { "labels": {}, "test_matrix": '{include: [{config: "1", unstable: "unstable"}, {config: "2", unstable: "unstable"}]}', "job_name": "macos-12-py3-arm64 / build", - "expected": "keep-going=False\nis-unstable=True\nreenabled-issues=\n", + "expected": _gen_expected_string(is_unstable=True), "description": "All configs are unstable", }, { "labels": {}, "test_matrix": '{include: [{config: "1", unstable: "unstable"}, {config: "2"}]}', "job_name": "macos-12-py3-arm64 / build", - "expected": "keep-going=False\nis-unstable=False\nreenabled-issues=\n", + "expected": _gen_expected_string(is_unstable=False), "description": "Only mark some configs as unstable", }, { @@ -692,7 +791,7 @@ def test_perform_misc_tasks(self, mocked_subprocess: Any) -> None: "test_matrix": '{include: [{config: "default"}]}', "job_name": "A job name", "pr_body": "resolves #123 fixes #234", - "expected": "keep-going=False\nis-unstable=False\nreenabled-issues=123,234\n", + "expected": _gen_expected_string(reenabled_issues="123,234"), "description": "Reenable some issues", }, ] diff --git a/.github/scripts/test_trymerge.py b/.github/scripts/test_trymerge.py index fbcbe048df14f..2641fd30f348e 100755 --- a/.github/scripts/test_trymerge.py +++ b/.github/scripts/test_trymerge.py @@ -16,6 +16,8 @@ from unittest import main, mock, skip, TestCase from urllib.error import HTTPError +from github_utils import gh_graphql + from gitutils import get_git_remote_name, get_git_repo_dir, GitRepo from trymerge import ( @@ -26,7 +28,6 @@ get_drci_classifications, get_rockset_results, gh_get_team_members, - gh_graphql, GitHubPR, JobCheckState, main as trymerge_main, @@ -140,11 +141,14 @@ def __init__(self) -> None: self.comment_id = 0 self.reason = "this is for testing" self.ignore_current = False + self.check_mergeability = False return Object() -def mock_remove_label(org: str, repo: str, pr_num: str, label: str) -> None: +def mock_remove_label( + org: str, repo: str, pr_num: str, label: str, dry_run: bool +) -> None: pass @@ -201,7 
+205,6 @@ def mocked_read_merge_rules(repo: Any, org: str, project: str) -> List[MergeRule approved_by=["pytorch/metamates", "ngimel"], mandatory_checks_name=[ "Lint", - "Facebook CLA Check", "pull / linux-xenial-cuda11.3-py3.7-gcc7 / build", ], ignore_flaky_failures=True, @@ -394,7 +397,7 @@ def test_gql_complexity(self, *args: Any) -> None: def test_gql_retrieve_checksuites(self, *args: Any) -> None: "Fetch comments and conclusions for PR with 60 commits" pr = GitHubPR("pytorch", "pytorch", 94787) - self.assertEqual(len(pr.get_checkrun_conclusions()), 183) + self.assertEqual(len(pr.get_checkrun_conclusions()), 182) def test_team_members(self, *args: Any) -> None: "Test fetching team members works" @@ -431,6 +434,13 @@ def test_get_author_many_reviews(self, *args: Any) -> None: assert pr._reviews is not None # to pacify mypy self.assertGreater(len(pr._reviews), 100) + def get_co_authors(self, *args: Any) -> None: + """Tests that co-authors are recognized""" + pr = GitHubPR("pytorch", "pytorch", 118347) + authors = pr.get_authors() + self.assertIn("kit1980", authors) + self.assertIn("Co-authored-by:", pr.gen_commit_message()) + def test_get_checkruns_many_runs(self, *args: Any) -> None: """Tests that all checkruns can be fetched""" pr = GitHubPR("pytorch", "pytorch", 105260) @@ -731,6 +741,30 @@ def test_get_classifications_unstable(self, *args: Any) -> None: self.assertTrue(len(failed) == 0) self.assertTrue(len(ignorable["UNSTABLE"]) == 1) + # Add another test case where there is no unstable keyword in the job name, but + # the job has already been marked as unstable + pr = GitHubPR("pytorch", "executorch", 3318) + checks = pr.get_checkrun_conclusions() + checks = get_classifications( + pr.pr_num, + pr.project, + checks, + [], + ) + print(checks) + workflow_name = "test-llama-app" + job_name = "mobile-job (android)" + self.assertTrue( + checks[f"Android / {workflow_name} / {job_name}"].classification + == "UNSTABLE" + ) + pending, failed, ignorable = categorize_checks( + checks, list(checks.keys()), ok_failed_checks_threshold=1 + ) + self.assertTrue(len(pending) == 0) + self.assertTrue(len(failed) == 0) + self.assertTrue(len(ignorable["UNSTABLE"]) == 1) + def test_get_classifications_broken_trunk(self, *args: Any) -> None: # The mock merge base is the actual value returned by gh_fetch_merge_base test_cases = [ @@ -822,6 +856,41 @@ def test_ignore_current(self, *args: Any) -> None: self.assertTrue(len(ignorable["FLAKY"]) == 4) self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 2) + def test_get_classifications_wrong_workflow_name(self, *args: Any) -> None: + pr = GitHubPR("pytorch", "pytorch", 123104) + checks = pr.get_checkrun_conclusions() + + check_name = "linux-binary-conda / conda-py3_8-cuda11_8-build / build" + check_name_workflow_path = ".github/workflows/generated-linux-binary-conda-nightly.yml / conda-py3_8-cuda11_8-build / build" + + # Mock a check where the workflow name uses the full path + checks[check_name_workflow_path] = JobCheckState( + check_name_workflow_path, + checks[check_name].url, + checks[check_name].status, + checks[check_name].classification, + checks[check_name].job_id, + checks[check_name].title, + checks[check_name].summary, + ) + del checks[check_name] + + checks = get_classifications( + pr.pr_num, + pr.project, + checks, + [], + ) + pending, failed, ignorable = categorize_checks( + checks, + list(checks.keys()), + ) + + self.assertTrue(len(pending) == 0) + self.assertTrue(len(failed) == 0) + self.assertTrue(len(ignorable["FLAKY"]) == 1) + 
self.assertTrue(len(ignorable["BROKEN_TRUNK"]) == 0) + @mock.patch("trymerge.read_merge_rules", side_effect=xla_merge_rules) def test_dont_ignore_flaky_failures(self, *args: Any) -> None: """ diff --git a/.github/scripts/trymerge.py b/.github/scripts/trymerge.py index c2b513e47f41c..95311d2d9b836 100755 --- a/.github/scripts/trymerge.py +++ b/.github/scripts/trymerge.py @@ -39,6 +39,7 @@ gh_fetch_json_list, gh_fetch_merge_base, gh_fetch_url, + gh_graphql, gh_post_commit_comment, gh_post_pr_comment, gh_update_pr_state, @@ -122,6 +123,7 @@ def __init__(self, name: str, url: str, status: Optional[str]): workflow { name } + databaseId url } checkRuns(first: 50) { @@ -152,12 +154,14 @@ def __init__(self, name: str, url: str, status: Optional[str]): fragment CommitAuthors on PullRequestCommitConnection { nodes { commit { - author { - user { - login + authors(first: 2) { + nodes { + user { + login + } + email + name } - email - name } oid } @@ -458,19 +462,6 @@ def __init__(self, name: str, url: str, status: Optional[str]): IGNORABLE_FAILED_CHECKS_THESHOLD = 10 -def gh_graphql(query: str, **kwargs: Any) -> Dict[str, Any]: - rc = gh_fetch_url( - "https://api.github.com/graphql", - data={"query": query, "variables": kwargs}, - reader=json.load, - ) - if "errors" in rc: - raise RuntimeError( - f"GraphQL query {query}, args {kwargs} failed: {rc['errors']}" - ) - return cast(Dict[str, Any], rc) - - def gh_get_pr_info(org: str, proj: str, pr_no: int) -> Any: rc = gh_graphql(GH_GET_PR_INFO_QUERY, name=proj, owner=org, number=pr_no) return rc["data"]["repository"]["pullRequest"] @@ -608,6 +599,7 @@ def parse_args() -> Any: parser.add_argument("--revert", action="store_true") parser.add_argument("--force", action="store_true") parser.add_argument("--ignore-current", action="store_true") + parser.add_argument("--check-mergeability", action="store_true") parser.add_argument("--comment-id", type=int) parser.add_argument("--reason", type=str) parser.add_argument("pr_num", type=int) @@ -745,7 +737,7 @@ def get_merge_base(self) -> str: # work for ghstack where the base is the custom branch, i.e. gh/USER/ID/base, # so let's just use main instead self.merge_base = gh_fetch_merge_base( - self.org, self.project, last_commit_oid, "main" + self.org, self.project, last_commit_oid, self.default_branch() ) # Fallback to baseRefOid if the API call fails, i.e. rate limit. 
Note that baseRefOid @@ -845,14 +837,14 @@ def _fetch_authors(self) -> List[Tuple[str, str]]: def add_authors(info: Dict[str, Any]) -> None: for node in info["commits_with_authors"]["nodes"]: - author_node = node["commit"]["author"] - user_node = author_node["user"] - author = f"{author_node['name']} <{author_node['email']}>" - if user_node is None: - # If author is not github user, user node will be null - authors.append(("", author)) - else: - authors.append((cast(str, user_node["login"]), author)) + for author_node in node["commit"]["authors"]["nodes"]: + user_node = author_node["user"] + author = f"{author_node['name']} <{author_node['email']}>" + if user_node is None: + # If author is not github user, user node will be null + authors.append(("", author)) + else: + authors.append((cast(str, user_node["login"]), author)) info = self.info for _ in range(100): @@ -948,11 +940,6 @@ def get_pr_next_checksuites(checksuites: Any) -> Any: def get_authors(self) -> Dict[str, str]: rc = {} - # TODO: replace with `self.get_commit_count()` when GraphQL pagination can be used - # to fetch all commits, see https://gist.github.com/malfet/4f35321b0c9315bcd7116c7b54d83372 - # and https://support.github.com/ticket/enterprise/1642/1659119 - if self.get_commit_count() <= 250: - assert len(self._fetch_authors()) == self.get_commit_count() for idx in range(len(self._fetch_authors())): rc[self.get_committer_login(idx)] = self.get_committer_author(idx) @@ -1068,6 +1055,7 @@ def merge_ghstack_into( repo: GitRepo, skip_mandatory_checks: bool, comment_id: Optional[int] = None, + skip_all_rule_checks: bool = False, ) -> List["GitHubPR"]: assert self.is_ghstack_pr() ghstack_prs = get_ghstack_prs( @@ -1082,7 +1070,7 @@ def merge_ghstack_into( commit_msg = pr.gen_commit_message( filter_ghstack=True, ghstack_deps=pr_dependencies ) - if pr.pr_num != self.pr_num: + if pr.pr_num != self.pr_num and not skip_all_rule_checks: # Raises exception if matching rule is not found find_matching_merge_rule( pr, @@ -1113,13 +1101,19 @@ def gen_commit_message( msg_body = re.sub(RE_GHSTACK_DESC, "", msg_body) msg = self.get_title() + f" (#{self.pr_num})\n\n" msg += msg_body + + # Mention PR co-authors + for author_login, author_name in self.get_authors().items(): + if author_login != self.get_pr_creator_login(): + msg += f"\nCo-authored-by: {author_name}" + msg += f"\nPull Request resolved: {self.get_pr_url()}\n" msg += f"Approved by: {approved_by_urls}\n" if ghstack_deps: msg += f"ghstack dependencies: {', '.join([f'#{pr.pr_num}' for pr in ghstack_deps])}\n" return msg - def add_numbered_label(self, label_base: str) -> None: + def add_numbered_label(self, label_base: str, dry_run: bool) -> None: labels = self.get_labels() if self.labels is not None else [] full_label = label_base count = 0 @@ -1127,7 +1121,7 @@ def add_numbered_label(self, label_base: str) -> None: if label_base in label: count += 1 full_label = f"{label_base}X{count}" - gh_add_labels(self.org, self.project, self.pr_num, [full_label]) + gh_add_labels(self.org, self.project, self.pr_num, [full_label], dry_run) def merge_into( self, @@ -1157,9 +1151,9 @@ def merge_into( repo.push(self.default_branch(), dry_run) if not dry_run: - self.add_numbered_label(MERGE_COMPLETE_LABEL) + self.add_numbered_label(MERGE_COMPLETE_LABEL, dry_run) for pr in additional_merged_prs: - pr.add_numbered_label(MERGE_COMPLETE_LABEL) + pr.add_numbered_label(MERGE_COMPLETE_LABEL, dry_run) if comment_id and self.pr_num: # When the merge process reaches this part, we can assume that the commit @@ 
-1199,7 +1193,11 @@ def merge_changes( skip_mandatory_checks: bool = False, comment_id: Optional[int] = None, branch: Optional[str] = None, + skip_all_rule_checks: bool = False, ) -> List["GitHubPR"]: + """ + :param skip_all_rule_checks: If true, skips all rule checks, useful for dry-running merge locally + """ branch_to_merge_into = self.default_branch() if branch is None else branch if repo.current_branch() != branch_to_merge_into: repo.checkout(branch_to_merge_into) @@ -1215,6 +1213,7 @@ def merge_changes( repo, skip_mandatory_checks, comment_id=comment_id, + skip_all_rule_checks=skip_all_rule_checks, ) @@ -1400,7 +1399,10 @@ def find_matching_merge_rule( ) required_checks = list( filter( - lambda x: "EasyCLA" in x or not skip_mandatory_checks, mandatory_checks + lambda x: ("EasyCLA" in x) + or ("Facebook CLA Check" in x) + or not skip_mandatory_checks, + mandatory_checks, ) ) pending_checks, failed_checks, _ = categorize_checks( @@ -1411,6 +1413,13 @@ def find_matching_merge_rule( else 0, ) + # categorize_checks assumes all tests are required if required_checks is empty. + # this is a workaround as we want to keep that behavior for categorize_checks + # generally. + if not required_checks: + pending_checks = [] + failed_checks = [] + hud_link = f"https://hud.pytorch.org/{pr.org}/{pr.project}/commit/{pr.last_commit()['oid']}" if len(failed_checks) > 0: if reject_reason_score < 30000: @@ -1610,28 +1619,59 @@ def remove_job_name_suffix(name: str, replacement: str = ")") -> str: def is_broken_trunk( - name: str, + check: JobCheckState, drci_classifications: Any, ) -> bool: - if not name or not drci_classifications: + if not check or not drci_classifications: return False + name = check.name + job_id = check.job_id + # Consult the list of broken trunk failures from Dr.CI return any( - name == broken_trunk["name"] + (name == broken_trunk["name"]) or (job_id and job_id == broken_trunk["id"]) for broken_trunk in drci_classifications.get("BROKEN_TRUNK", []) ) +def is_unstable( + check: JobCheckState, + drci_classifications: Any, +) -> bool: + if not check or not drci_classifications: + return False + + name = check.name + job_id = check.job_id + + # The job name has the unstable keyword. This is the original way to mark a job + # as unstable on HUD, Dr.CI, and trymerge + if "unstable" in name: + return True + + # Consult the list of unstable failures from Dr.CI + return any( + (name == unstable["name"] or (job_id and job_id == unstable["id"])) + for unstable in drci_classifications.get("UNSTABLE", []) + ) + + def is_flaky( - name: str, + check: JobCheckState, drci_classifications: Any, ) -> bool: - if not name or not drci_classifications: + if not check or not drci_classifications: return False + name = check.name + job_id = check.job_id + # Consult the list of flaky failures from Dr.CI - return any(name == flaky["name"] for flaky in drci_classifications.get("FLAKY", [])) + return any( + (name == flaky["name"] or (job_id and job_id == flaky["id"])) + for flaky in drci_classifications.get("FLAKY", []) + ) def is_invalid_cancel( @@ -1669,7 +1709,19 @@ def get_classifications( # going forward. 
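The `is_broken_trunk`, `is_unstable`, and `is_flaky` helpers above now take the whole `JobCheckState` and match a check against Dr.CI's classification buckets by either display name or numeric job id, so renamed jobs are still recognized. A minimal standalone sketch of that matching rule, assuming a simplified stand-in for `JobCheckState` (only the two fields the rule needs; the sample data is illustrative):

```python
from typing import List, NamedTuple, Optional


class FakeCheck(NamedTuple):
    # Simplified stand-in for trymerge's JobCheckState.
    name: str
    job_id: Optional[int]


def matches_drci_bucket(check: FakeCheck, bucket: List[dict]) -> bool:
    # A check belongs to a Dr.CI bucket (FLAKY / UNSTABLE / BROKEN_TRUNK)
    # when either its name or its job id appears in that bucket.
    return any(
        check.name == entry["name"] or (check.job_id and check.job_id == entry["id"])
        for entry in bucket
    )


flaky = [{"name": "pull / linux-jammy-py3.8 / test (default, 1, 3)", "id": 123456}]
print(matches_drci_bucket(FakeCheck("renamed job", 123456), flaky))   # True, via job id
print(matches_drci_bucket(FakeCheck("some other job", None), flaky))  # False
```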
It's preferable to try calling Dr.CI API directly first # to get the latest results as well as update Dr.CI PR comment drci_classifications = get_drci_classifications(pr_num=pr_num, project=project) - print(f"From Dr.CI API: {json.dumps(drci_classifications)}") + + def get_readable_drci_results(drci_classifications: Any) -> str: + try: + s = f"From Dr.CI API ({pr_num}):\n" + for classification, jobs in drci_classifications.items(): + s += f" {classification}: \n" + for job in jobs: + s += f" {job['id']} {job['name']}\n" + return s + except Exception: + return f"From Dr.CI API: {json.dumps(drci_classifications)}" + + print(get_readable_drci_results(drci_classifications)) # NB: if the latest results from Dr.CI is not available, i.e. when calling from # SandCastle, we fallback to any results we can find on Dr.CI check run summary @@ -1692,7 +1744,7 @@ def get_classifications( if check.status == "SUCCESS" or check.status == "NEUTRAL": continue - if "unstable" in name: + if is_unstable(check, drci_classifications): checks_with_classifications[name] = JobCheckState( check.name, check.url, @@ -1706,7 +1758,7 @@ def get_classifications( # NB: It's important to note that when it comes to ghstack and broken trunk classification, # Dr.CI uses the base of the whole stack - if is_broken_trunk(name, drci_classifications): + if is_broken_trunk(check, drci_classifications): checks_with_classifications[name] = JobCheckState( check.name, check.url, @@ -1718,7 +1770,7 @@ def get_classifications( ) continue - elif is_flaky(name, drci_classifications): + elif is_flaky(check, drci_classifications): checks_with_classifications[name] = JobCheckState( check.name, check.url, @@ -1882,8 +1934,8 @@ def do_revert_prs( pr.org, pr.project, pr.pr_num, revert_message, dry_run=dry_run ) + pr.add_numbered_label("reverted", dry_run) if not dry_run: - pr.add_numbered_label("reverted") gh_post_commit_comment(pr.org, pr.project, commit_sha, revert_msg) gh_update_pr_state(pr.org, pr.project, pr.pr_num) @@ -2053,7 +2105,7 @@ def merge( print(f"Attempting merge of {initial_commit_sha} ({pr_link})") if MERGE_IN_PROGRESS_LABEL not in pr.get_labels(): - gh_add_labels(pr.org, pr.project, pr.pr_num, [MERGE_IN_PROGRESS_LABEL]) + gh_add_labels(pr.org, pr.project, pr.pr_num, [MERGE_IN_PROGRESS_LABEL], dry_run) explainer = TryMergeExplainer( skip_mandatory_checks, @@ -2073,8 +2125,7 @@ def merge( check_for_sev(pr.org, pr.project, skip_mandatory_checks) - if skip_mandatory_checks or can_skip_internal_checks(pr, comment_id): - # do not wait for any pending signals if PR is closed as part of co-development process + if skip_mandatory_checks: gh_post_pr_comment( pr.org, pr.project, @@ -2201,8 +2252,7 @@ def merge( # Finally report timeout back msg = f"Merged timed out after {timeout_minutes} minutes. Please contact the pytorch_dev_infra team." 
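The diff also pushes the `dry_run` decision down into the labeling helpers: `add_numbered_label`, `gh_add_labels`, and `gh_remove_label` now receive `dry_run` directly instead of being wrapped in `if not dry_run:` guards at every call site. A hedged sketch of that pattern, with a hypothetical `post` callable standing in for the real GitHub API call:

```python
import logging
from typing import Callable, Dict, List

log = logging.getLogger(__name__)


def add_labels(
    org: str,
    repo: str,
    pr_num: int,
    labels: List[str],
    dry_run: bool,
    post: Callable[[str, Dict[str, List[str]]], None],
) -> None:
    # Illustrative helper in the spirit of gh_add_labels: the helper decides
    # whether to hit the API, so callers can pass dry_run through unchanged.
    url = f"https://api.github.com/repos/{org}/{repo}/issues/{pr_num}/labels"
    if dry_run:
        log.info("Dry run: would POST %s to %s", labels, url)
        return
    post(url, {"labels": labels})
```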
msg += f"The last exception was: {last_exception}" - if not dry_run: - gh_add_labels(pr.org, pr.project, pr.pr_num, ["land-failed"]) + gh_add_labels(pr.org, pr.project, pr.pr_num, ["land-failed"], dry_run) raise RuntimeError(msg) @@ -2281,6 +2331,16 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: ) return + if args.check_mergeability: + if pr.is_ghstack_pr(): + get_ghstack_prs(repo, pr) # raises error if out of sync + pr.merge_changes( + repo, + skip_mandatory_checks=True, + skip_all_rule_checks=True, + ) + return + if not args.force and pr.has_invalid_submodule_updates(): message = ( f"This PR updates submodules {', '.join(pr.get_changed_submodules())}\n" @@ -2329,7 +2389,10 @@ def handle_exception(e: Exception, title: str = "Merge failed") -> None: else: print("Missing comment ID or PR number, couldn't upload to Rockset") finally: - gh_remove_label(org, project, args.pr_num, MERGE_IN_PROGRESS_LABEL) + if not args.check_mergeability: + gh_remove_label( + org, project, args.pr_num, MERGE_IN_PROGRESS_LABEL, args.dry_run + ) if __name__ == "__main__": diff --git a/.github/scripts/tryrebase.py b/.github/scripts/tryrebase.py index 641b354ef7dce..39a38aaf364c7 100755 --- a/.github/scripts/tryrebase.py +++ b/.github/scripts/tryrebase.py @@ -60,7 +60,7 @@ def rebase_onto( repo._run_git("rebase", onto_branch, branch) if repo.rev_parse(branch) == repo.rev_parse(onto_branch): - raise Exception(SAME_SHA_ERROR) + raise Exception(SAME_SHA_ERROR) # noqa: TRY002 if dry_run: push_result = repo._run_git("push", "--dry-run", "-f", remote_url, refspec) @@ -100,7 +100,7 @@ def rebase_ghstack_onto( repo._run_git("rebase", onto_branch, orig_ref) if repo.rev_parse(orig_ref) == repo.rev_parse(onto_branch): - raise Exception(SAME_SHA_ERROR) + raise Exception(SAME_SHA_ERROR) # noqa: TRY002 # steal the identity of the committer of the commit on the orig branch email = repo._run_git("log", orig_ref, "--pretty=format:%ae", "-1") @@ -126,7 +126,7 @@ def rebase_ghstack_onto( print(push_result) if ghstack_result.returncode != 0: print(ghstack_result.stderr.decode("utf-8")) - raise Exception(f"\n```{push_result}```") + raise Exception(f"\n```{push_result}```") # noqa: TRY002 # The contents of a successful push result should look like: # Summary of changes (ghstack 0.6.0) diff --git a/.github/scripts/update_commit_hashes.py b/.github/scripts/update_commit_hashes.py deleted file mode 100644 index 095e21d21e773..0000000000000 --- a/.github/scripts/update_commit_hashes.py +++ /dev/null @@ -1,171 +0,0 @@ -import json -import os -import subprocess -from argparse import ArgumentParser -from typing import Any, Dict - -import requests - -UPDATEBOT_TOKEN = os.environ["UPDATEBOT_TOKEN"] -PYTORCHBOT_TOKEN = os.environ["PYTORCHBOT_TOKEN"] -OWNER, REPO = "pytorch", "pytorch" - - -def git_api( - url: str, params: Dict[str, str], type: str = "get", token: str = UPDATEBOT_TOKEN -) -> Any: - headers = { - "Accept": "application/vnd.github.v3+json", - "Authorization": f"token {token}", - } - if type == "post": - return requests.post( - f"https://api.github.com{url}", - data=json.dumps(params), - headers=headers, - ).json() - elif type == "patch": - return requests.patch( - f"https://api.github.com{url}", - data=json.dumps(params), - headers=headers, - ).json() - else: - return requests.get( - f"https://api.github.com{url}", - params=params, - headers=headers, - ).json() - - -def parse_args() -> Any: - parser = ArgumentParser("Rebase PR into branch") - parser.add_argument("--repo-name", type=str) - 
parser.add_argument("--branch", type=str) - parser.add_argument("--pin-folder", type=str) - return parser.parse_args() - - -def make_pr(repo_name: str, branch_name: str) -> Any: - params = { - "title": f"[{repo_name} hash update] update the pinned {repo_name} hash", - "head": branch_name, - "base": "main", - "body": "This PR is auto-generated nightly by [this action](https://github.com/pytorch/pytorch/blob/main/" - + f".github/workflows/_update-commit-hash.yml).\nUpdate the pinned {repo_name} hash.", - } - response = git_api(f"/repos/{OWNER}/{REPO}/pulls", params, type="post") - print(f"made pr {response['html_url']}") - return response["number"] - - -def approve_pr(pr_number: str) -> None: - params = {"event": "APPROVE"} - # use pytorchbot to approve the pr - git_api( - f"/repos/{OWNER}/{REPO}/pulls/{pr_number}/reviews", - params, - type="post", - token=PYTORCHBOT_TOKEN, - ) - - -def make_comment(pr_number: str, msg: str) -> None: - params = {"body": msg} - # comment with pytorchbot because pytorchmergebot gets ignored - git_api( - f"/repos/{OWNER}/{REPO}/issues/{pr_number}/comments", - params, - type="post", - token=PYTORCHBOT_TOKEN, - ) - - -def close_pr(pr_number: str) -> None: - params = {"state": "closed"} - git_api( - f"/repos/{OWNER}/{REPO}/pulls/{pr_number}", - params, - type="patch", - ) - - -def is_newer_hash(new_hash: str, old_hash: str, repo_name: str) -> bool: - def _get_date(hash: str) -> int: - # this git command prints the unix timestamp of the hash - return int( - subprocess.run( - f"git show --no-patch --no-notes --pretty=%ct {hash}".split(), - capture_output=True, - cwd=f"{repo_name}", - ) - .stdout.decode("utf-8") - .strip() - ) - - return _get_date(new_hash) > _get_date(old_hash) - - -def main() -> None: - args = parse_args() - - branch_name = os.environ["NEW_BRANCH_NAME"] - pr_num = None - - # query to see if a pr already exists - params = { - "q": f"is:pr is:open in:title author:pytorchupdatebot repo:{OWNER}/{REPO} {args.repo_name} hash update", - "sort": "created", - } - response = git_api("/search/issues", params) - if response["total_count"] != 0: - # pr does exist - pr_num = response["items"][0]["number"] - link = response["items"][0]["html_url"] - response = git_api(f"/repos/{OWNER}/{REPO}/pulls/{pr_num}", {}) - branch_name = response["head"]["ref"] - print( - f"pr does exist, number is {pr_num}, branch name is {branch_name}, link is {link}" - ) - - hash = ( - subprocess.run( - f"git rev-parse {args.branch}".split(), - capture_output=True, - cwd=f"{args.repo_name}", - ) - .stdout.decode("utf-8") - .strip() - ) - with open(f"{args.pin_folder}/{args.repo_name}.txt", "r+") as f: - old_hash = f.read().strip() - subprocess.run(f"git checkout {old_hash}".split(), cwd=args.repo_name) - f.seek(0) - f.truncate() - f.write(f"{hash}\n") - if is_newer_hash(hash, old_hash, args.repo_name): - # if there was an update, push to branch - subprocess.run(f"git checkout -b {branch_name}".split()) - subprocess.run(f"git add {args.pin_folder}/{args.repo_name}.txt".split()) - subprocess.run( - "git commit -m".split() + [f"update {args.repo_name} commit hash"] - ) - subprocess.run(f"git push --set-upstream origin {branch_name} -f".split()) - print(f"changes pushed to branch {branch_name}") - if pr_num is None: - # no existing pr, so make a new one and approve it - pr_num = make_pr(args.repo_name, branch_name) - approve_pr(pr_num) - make_comment(pr_num, "@pytorchbot merge") - else: - print( - f"tried to update from old hash: {old_hash} to new hash: {hash} but the old hash seems to be 
newer, not creating pr" - ) - if pr_num is not None: - make_comment(pr_num, "closing pr as the current hash seems up to date") - close_pr(pr_num) - print(f"closing PR {pr_num}") - - -if __name__ == "__main__": - main() diff --git a/.github/templates/linux_binary_build_workflow.yml.j2 b/.github/templates/linux_binary_build_workflow.yml.j2 index 762c473ce2ff9..d44915f41d85f 100644 --- a/.github/templates/linux_binary_build_workflow.yml.j2 +++ b/.github/templates/linux_binary_build_workflow.yml.j2 @@ -7,6 +7,7 @@ name: !{{ build_environment }} {%- endblock %} + on: push: {%- if branches == "nightly" %} @@ -45,7 +46,7 @@ env: PYTORCH_FINAL_PACKAGE_DIR: /artifacts PYTORCH_ROOT: /pytorch SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 !{{ common.concurrency(build_environment) }} jobs: diff --git a/.github/templates/macos_binary_build_workflow.yml.j2 b/.github/templates/macos_binary_build_workflow.yml.j2 index 505bde406d6a8..591dc52ef9c01 100644 --- a/.github/templates/macos_binary_build_workflow.yml.j2 +++ b/.github/templates/macos_binary_build_workflow.yml.j2 @@ -48,7 +48,7 @@ env: BUILD_ENVIRONMENT: !{{ build_environment }} GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 {%- if cross_compile_arm64 %} CROSS_COMPILE_ARM64: 1 {% endif %} diff --git a/.github/templates/upload.yml.j2 b/.github/templates/upload.yml.j2 index 62153da0cbf01..2d488d4f14dda 100644 --- a/.github/templates/upload.yml.j2 +++ b/.github/templates/upload.yml.j2 @@ -53,6 +53,9 @@ {%- macro upload_binaries(config, is_windows=False, has_test=True, use_s3=True) -%} !{{ config["build_name"] }}-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read {%- if has_test %} needs: !{{ config["build_name"] }}-test {%- else %} @@ -65,8 +68,6 @@ {%- endif %} secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/_android-build-test.yml b/.github/workflows/_android-build-test.yml index 9cb8bb287a9f1..d599e769b8b6a 100644 --- a/.github/workflows/_android-build-test.yml +++ b/.github/workflows/_android-build-test.yml @@ -131,7 +131,7 @@ jobs: export COMMAND # shellcheck disable=SC2016 - COMMAND='(echo "sudo chown -R jenkins workspace && cd workspace && ./.circleci/scripts/build_android_gradle.sh" | docker exec -u jenkins -e BUILD_LITE_INTERPRETER -e GRADLE_OFFLINE=1 -i "$id" bash) 2>&1' + COMMAND='(echo "sudo chown -R jenkins workspace && cd workspace && ./scripts/build_android_gradle.sh" | docker exec -u jenkins -e BUILD_LITE_INTERPRETER -e GRADLE_OFFLINE=1 -i "$id" bash) 2>&1' echo "${COMMAND}" > ./command.sh && bash ./command.sh # Skip docker push as this job is purely for size analysis purpose. # Result binaries are already in `/home/circleci/project/` as it's mounted instead of copied. 
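For reference, the freshness check that the removed `update_commit_hashes.py` performed in `is_newer_hash` reduces to comparing committer timestamps obtained from `git show --pretty=%ct`. A minimal sketch of that comparison, independent of the deleted script:

```python
import subprocess


def commit_timestamp(repo_dir: str, rev: str) -> int:
    # %ct prints the committer date as a unix timestamp, so a plain integer
    # comparison is enough to decide which commit is newer.
    out = subprocess.run(
        ["git", "show", "--no-patch", "--no-notes", "--pretty=%ct", rev],
        capture_output=True,
        check=True,
        cwd=repo_dir,
    )
    return int(out.stdout.decode("utf-8").strip())


def is_newer(repo_dir: str, new_rev: str, old_rev: str) -> bool:
    return commit_timestamp(repo_dir, new_rev) > commit_timestamp(repo_dir, old_rev)
```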
diff --git a/.github/workflows/_android-full-build-test.yml b/.github/workflows/_android-full-build-test.yml index 965667b7da7a1..7a0c4377eca4e 100644 --- a/.github/workflows/_android-full-build-test.yml +++ b/.github/workflows/_android-full-build-test.yml @@ -157,7 +157,7 @@ jobs: docker cp "${GITHUB_WORKSPACE}/build_android_install_x86_32" "${ID_X86_32}:/var/lib/jenkins/workspace/build_android_install_x86_32" # run gradle buildRelease - (echo "./.circleci/scripts/build_android_gradle.sh" | docker exec \ + (echo "./scripts/build_android_gradle.sh" | docker exec \ -e BUILD_ENVIRONMENT="pytorch-linux-focal-py3-clang9-android-ndk-r21e-gradle-build" \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e AWS_DEFAULT_REGION \ diff --git a/.github/workflows/_bazel-build-test.yml b/.github/workflows/_bazel-build-test.yml index d7fbdc8b1ded8..ca65ce64bc657 100644 --- a/.github/workflows/_bazel-build-test.yml +++ b/.github/workflows/_bazel-build-test.yml @@ -86,9 +86,14 @@ jobs: with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Check if in a ARC runner + shell: bash + id: check_arc_runner + run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG uses: pytorch/test-infra/.github/actions/setup-nvidia@main - if: ${{ inputs.cuda-version != 'cpu' }} + if: ${{ inputs.cuda-version != 'cpu' && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }} - name: Output disk space left run: | diff --git a/.github/workflows/_binary-build-linux.yml b/.github/workflows/_binary-build-linux.yml index 3dee4ba92e7f0..34400149e53ff 100644 --- a/.github/workflows/_binary-build-linux.yml +++ b/.github/workflows/_binary-build-linux.yml @@ -78,7 +78,7 @@ on: jobs: build: runs-on: ${{ inputs.runs_on }} - timeout-minutes: 180 + timeout-minutes: 210 env: PYTORCH_ROOT: ${{ inputs.PYTORCH_ROOT }} BUILDER_ROOT: ${{ inputs.BUILDER_ROOT }} diff --git a/.github/workflows/_binary-upload.yml b/.github/workflows/_binary-upload.yml index 79014e30ef196..1231dd0e8c7d4 100644 --- a/.github/workflows/_binary-upload.yml +++ b/.github/workflows/_binary-upload.yml @@ -59,18 +59,13 @@ on: github-token: required: true description: Github Token - aws-pytorch-uploader-access-key-id: - required: true - description: AWS access key id - aws-pytorch-uploader-secret-access-key: - required: true - description: AWS secret access key conda-pytorchbot-token: required: true description: Conda PyTorchBot token conda-pytorchbot-token-test: required: true description: Conda PyTorchBot token + jobs: upload: runs-on: ubuntu-22.04 @@ -104,6 +99,20 @@ jobs: with: no-sudo: true + - name: Configure AWS credentials(PyTorch account) for nightly + if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/nightly' }} + uses: aws-actions/configure-aws-credentials@v3 + with: + role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels + aws-region: us-east-1 + + - name: Configure AWS credentials(PyTorch account) for RC builds + if: ${{ github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/')) }} + uses: aws-actions/configure-aws-credentials@v3 + with: + role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_test_build_wheels + aws-region: us-east-1 + - name: Download Build Artifacts id: download-artifacts # NB: When the previous build job is skipped, there won't be any artifacts and @@ -135,8 +144,6 @@ jobs: PKG_DIR: "${{ runner.temp 
}}/artifacts" UPLOAD_SUBFOLDER: "${{ env.DESIRED_CUDA }}" # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.aws-pytorch-uploader-access-key-id }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.aws-pytorch-uploader-secret-access-key }} CONDA_PYTORCHBOT_TOKEN: ${{ secrets.conda-pytorchbot-token }} CONDA_PYTORCHBOT_TOKEN_TEST: ${{ secrets.conda-pytorchbot-token-test }} BUILD_NAME: ${{ inputs.build_name }} diff --git a/.github/workflows/_docs.yml b/.github/workflows/_docs.yml index 6b354fe92606a..069bcb4d2a14e 100644 --- a/.github/workflows/_docs.yml +++ b/.github/workflows/_docs.yml @@ -28,7 +28,21 @@ on: description: | If this is set, our linter will use this to make sure that every other job with the same `sync-tag` is identical. - + s3-bucket: + description: S3 bucket to download artifact + required: false + type: string + default: "gha-artifacts" + aws-role-to-assume: + description: role to assume for downloading artifacts + required: false + type: string + default: "" + upload-aws-role-to-assume: + description: role to assume for downloading artifacts + required: false + type: string + default: "" secrets: GH_PYTORCHBOT_TOKEN: required: false @@ -82,6 +96,14 @@ jobs: - name: Setup Linux uses: ./.github/actions/setup-linux + - name: configure aws credentials + if : ${{ inputs.aws-role-to-assume != '' }} + uses: aws-actions/configure-aws-credentials@v3 + with: + role-to-assume: ${{ inputs.aws-role-to-assume }} + role-session-name: gha-linux-test + aws-region: us-east-1 + - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main @@ -97,6 +119,7 @@ jobs: uses: ./.github/actions/download-build-artifacts with: name: ${{ inputs.build-environment }} + s3-bucket: ${{ inputs.s3-bucket }} - name: Generate netrc (only for docs-push) if: inputs.push @@ -156,6 +179,14 @@ jobs: uses: ./.github/actions/chown-workspace if: always() + - name: configure aws credentials + if : ${{ inputs.upload-aws-role-to-assume != '' }} + uses: aws-actions/configure-aws-credentials@v3 + with: + role-to-assume: ${{ inputs.upload-aws-role-to-assume }} + role-session-name: gha-linux-test + aws-region: us-east-1 + - name: Upload Python Docs Preview uses: seemethere/upload-artifact-s3@v5 if: ${{ github.event_name == 'pull_request' && matrix.docs_type == 'python' && steps.build-docs.outcome == 'success' }} @@ -163,7 +194,7 @@ jobs: retention-days: 14 s3-bucket: doc-previews if-no-files-found: error - path: pytorch.github.io/docs/main/ + path: pytorch_docs/main/ s3-prefix: pytorch/pytorch/${{ github.event.pull_request.number }} - name: Upload C++ Docs Preview diff --git a/.github/workflows/_linux-build-label.yml b/.github/workflows/_linux-build-label.yml new file mode 100644 index 0000000000000..427f993b48530 --- /dev/null +++ b/.github/workflows/_linux-build-label.yml @@ -0,0 +1,109 @@ +name: linux-build + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + docker-image-name: + required: true + type: string + description: Name of the base docker image to build with. + build-generates-artifacts: + required: false + type: boolean + default: true + description: If set, upload generated build artifacts. + build-with-debug: + required: false + type: boolean + default: false + description: If set, build in debug mode. 
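The upload workflows above drop the long-lived `AWS_PYTORCH_UPLOADER_*` secrets in favour of OIDC role assumption and pick the role from the pushed ref: `refs/heads/nightly` maps to the nightly role, while version tags (excluding `refs/tags/ciflow/*`) map to the release-candidate role. A small sketch mirroring those workflow conditions (the tag in the asserts is only an example value):

```python
from typing import Optional


def upload_channel(event_name: str, ref: str) -> Optional[str]:
    # Mirrors the `if:` conditions on the configure-aws-credentials steps.
    if event_name != "push":
        return None
    if ref == "refs/heads/nightly":
        return "nightly"
    if ref.startswith("refs/tags/") and not ref.startswith("refs/tags/ciflow/"):
        return "release-candidate"
    return None


assert upload_channel("push", "refs/heads/nightly") == "nightly"
assert upload_channel("push", "refs/tags/v2.4.0-rc1") == "release-candidate"
assert upload_channel("push", "refs/tags/ciflow/trunk/12345") is None
```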
+ sync-tag: + required: false + type: string + default: "" + description: | + If this is set, our linter will use this to make sure that every other + job with the same `sync-tag` is identical. + cuda-arch-list: + required: false + type: string + default: "5.2" + description: Runner label to select worker type + runner: + required: false + type: string + default: "linux.2xlarge" + description: | + List of CUDA architectures CI build should target. + test-matrix: + required: false + type: string + description: | + An option JSON description of what test configs to run later on. This + is moved here from the Linux test workflow so that we can apply filter + logic using test-config labels earlier and skip unnecessary builds + s3-bucket: + description: S3 bucket to download artifact + required: false + type: string + default: "gha-artifacts" + aws-role-to-assume: + description: role to assume for downloading artifacts + required: false + type: string + default: "" + secrets: + HUGGING_FACE_HUB_TOKEN: + required: false + description: | + HF Auth token to avoid rate limits when downloading models or datasets from hub + + outputs: + docker-image: + value: ${{ jobs.build.outputs.docker-image }} + description: The docker image containing the built PyTorch. + test-matrix: + value: ${{ jobs.build.outputs.test-matrix }} + description: An optional JSON description of what test configs to run later on. + +jobs: + build: + # Don't run on forked repos + if: github.repository_owner == 'pytorch' + runs-on: ${{ inputs.runner }} + timeout-minutes: 240 + outputs: + docker-image: ${{ steps.linux-build.outputs.docker-image }} + test-matrix: ${{ steps.linux-build.outputs.test-matrix }} + steps: + - name: Setup SSH (Click me for login details) + uses: pytorch/test-infra/.github/actions/setup-ssh@main + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + + # [pytorch repo ref] + # Use a pytorch/pytorch reference instead of a reference to the local + # checkout because when we run this action we don't *have* a local + # checkout. In other cases you should prefer a local checkout. + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + + - name: Linux Build + id: linux-build + uses: ./.github/actions/linux-build + with: + build-environment: ${{ inputs.build-environment }} + docker-image-name: ${{ inputs.docker-image-name }} + build-generates-artifacts: ${{ inputs.build-generates-artifacts }} + build-with-debug: ${{ inputs.build-with-debug }} + sync-tag: ${{ inputs.sync-tag }} + cuda-arch-list: ${{ inputs.cuda-arch-list }} + test-matrix: ${{ inputs.test-matrix }} + s3-bucket: ${{ inputs.s3-bucket }} + aws-role-to-assume: ${{ inputs.aws-role-to-assume }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} diff --git a/.github/workflows/_linux-build-rg.yml b/.github/workflows/_linux-build-rg.yml new file mode 100644 index 0000000000000..6c6a4827e1672 --- /dev/null +++ b/.github/workflows/_linux-build-rg.yml @@ -0,0 +1,105 @@ +name: linux-build-rg + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + docker-image-name: + required: true + type: string + description: Name of the base docker image to build with. + build-generates-artifacts: + required: false + type: boolean + default: true + description: If set, upload generated build artifacts. 
+ build-with-debug: + required: false + type: boolean + default: false + description: If set, build in debug mode. + sync-tag: + required: false + type: string + default: "" + description: | + If this is set, our linter will use this to make sure that every other + job with the same `sync-tag` is identical. + cuda-arch-list: + required: false + type: string + default: "5.2" + description: | + List of CUDA architectures CI build should target. + runner-group: + required: false + type: string + default: "arc-lf-linux.2xlarge" + description: Runner group to select group type + test-matrix: + required: false + type: string + description: | + An option JSON description of what test configs to run later on. This + is moved here from the Linux test workflow so that we can apply filter + logic using test-config labels earlier and skip unnecessary builds + s3-bucket: + description: S3 bucket to download artifact + required: false + type: string + default: "gha-artifacts" + aws-role-to-assume: + description: role to assume for downloading artifacts + required: false + type: string + default: "" + secrets: + HUGGING_FACE_HUB_TOKEN: + required: false + description: | + HF Auth token to avoid rate limits when downloading models or datasets from hub + + outputs: + docker-image: + value: ${{ jobs.build.outputs.docker-image }} + description: The docker image containing the built PyTorch. + test-matrix: + value: ${{ jobs.build.outputs.test-matrix }} + description: An optional JSON description of what test configs to run later on. + +jobs: + build: + # Don't run on forked repos + if: github.repository_owner == 'pytorch' + runs-on: + group: ${{ inputs.runner-group }} + timeout-minutes: 240 + outputs: + docker-image: ${{ steps.linux-build.outputs.docker-image }} + test-matrix: ${{ steps.linux-build.outputs.test-matrix }} + steps: + # [pytorch repo ref] + # Use a pytorch/pytorch reference instead of a reference to the local + # checkout because when we run this action we don't *have* a local + # checkout. In other cases you should prefer a local checkout. + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + + - name: Linux Build + id: linux-build + uses: ./.github/actions/linux-build + with: + build-environment: ${{ inputs.build-environment }} + docker-image-name: ${{ inputs.docker-image-name }} + build-generates-artifacts: ${{ inputs.build-generates-artifacts }} + build-with-debug: ${{ inputs.build-with-debug }} + sync-tag: ${{ inputs.sync-tag }} + cuda-arch-list: ${{ inputs.cuda-arch-list }} + test-matrix: ${{ inputs.test-matrix }} + s3-bucket: ${{ inputs.s3-bucket }} + aws-role-to-assume: ${{ inputs.aws-role-to-assume }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} diff --git a/.github/workflows/_linux-build.yml b/.github/workflows/_linux-build.yml index 9a88ed70b7f2a..c3bcb0d888dfc 100644 --- a/.github/workflows/_linux-build.yml +++ b/.github/workflows/_linux-build.yml @@ -47,6 +47,23 @@ on: An option JSON description of what test configs to run later on. 
This is moved here from the Linux test workflow so that we can apply filter logic using test-config labels earlier and skip unnecessary builds + selected-test-configs: + description: | + A comma-separated list of test configurations from the test matrix to keep, + The empty list means we are going to keep every configurations by defaults + required: false + type: string + default: "" + s3-bucket: + description: S3 bucket to download artifact + required: false + type: string + default: "gha-artifacts" + aws-role-to-assume: + description: Role to assume for downloading artifacts + required: false + type: string + default: "" secrets: HUGGING_FACE_HUB_TOKEN: required: false @@ -87,6 +104,14 @@ jobs: - name: Setup Linux uses: ./.github/actions/setup-linux + - name: configure aws credentials + uses: aws-actions/configure-aws-credentials@v3 + if: ${{ inputs.aws-role-to-assume != '' }} + with: + role-to-assume: ${{ inputs.aws-role-to-assume }} + role-session-name: gha-linux-build + aws-region: us-east-1 + - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main @@ -125,6 +150,7 @@ jobs: with: github-token: ${{ secrets.GITHUB_TOKEN }} test-matrix: ${{ inputs.test-matrix }} + selected-test-configs: ${{ inputs.selected-test-configs }} job-name: ${{ steps.get-job-id.outputs.job-name }} - name: Download pytest cache @@ -133,6 +159,7 @@ jobs: with: cache_dir: .pytest_cache job_identifier: ${{ github.workflow }}_${{ inputs.build-environment }} + s3_bucket: ${{ inputs.s3-bucket }} - name: Build if: steps.filter.outputs.is-test-matrix-empty == 'False' || inputs.test-matrix == '' @@ -197,6 +224,7 @@ jobs: retention-days: 14 if-no-files-found: error path: artifacts.zip + s3-bucket: ${{ inputs.s3-bucket }} - name: Upload sccache stats if: steps.build.outcome != 'skipped' @@ -207,6 +235,7 @@ jobs: retention-days: 365 if-no-files-found: warn path: sccache-stats-*.json + s3-bucket: ${{ inputs.s3-bucket }} - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@main diff --git a/.github/workflows/_linux-test-label.yml b/.github/workflows/_linux-test-label.yml new file mode 100644 index 0000000000000..7056c0168a19e --- /dev/null +++ b/.github/workflows/_linux-test-label.yml @@ -0,0 +1,85 @@ +name: linux-test-rg + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + test-matrix: + required: true + type: string + description: JSON description of what test configs to run. + docker-image: + required: true + type: string + description: Docker image to run in. + sync-tag: + required: false + type: string + default: "" + description: | + If this is set, our linter will use this to make sure that every other + job with the same `sync-tag` is identical. + timeout-minutes: + required: false + type: number + default: 240 + description: | + Set the maximum (in minutes) how long the workflow should take to finish + use-gha: + required: false + type: string + default: "" + description: If set to any value, upload to GHA. Otherwise upload to S3. 
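The new `selected-test-configs` input to `_linux-build.yml` is a comma-separated allow-list handed to the filter step; the actual filtering happens in `.github/scripts/filter_test_configs.py`, which is not part of this diff. A rough sketch of the intended behaviour, with the empty-selection-keeps-everything default taken from the input description:

```python
import json


def filter_selected_configs(test_matrix: str, selected: str) -> str:
    # Keep only the configs named in the comma-separated `selected` string;
    # an empty selection leaves the matrix untouched (assumed default).
    wanted = {c.strip().lower() for c in selected.split(",") if c.strip()}
    matrix = json.loads(test_matrix)
    if wanted:
        matrix["include"] = [
            entry
            for entry in matrix["include"]
            if entry.get("config", "").lower() in wanted
        ]
    return json.dumps(matrix)


print(filter_selected_configs(
    '{"include": [{"config": "default"}, {"config": "distributed"}]}',
    "distributed",
))
```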
+ dashboard-tag: + required: false + type: string + default: "" + s3-bucket: + description: S3 bucket to download artifact + required: false + type: string + default: "gha-artifacts" + aws-role-to-assume: + description: role to assume for downloading artifacts + required: false + type: string + default: "" + secrets: + HUGGING_FACE_HUB_TOKEN: + required: false + description: | + HF Auth token to avoid rate limits when downloading models or datasets from hub + +env: + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + test: + # Don't run on forked repos or empty test matrix + if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]' + strategy: + matrix: ${{ fromJSON(inputs.test-matrix) }} + fail-fast: false + runs-on: ${{ matrix.runner }} + timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + + - name: Linux Test + id: linux-test + uses: ./.github/actions/linux-test + with: + build-environment: ${{ inputs.build-environment }} + test-matrix: ${{ inputs.test-matrix }} + docker-image: ${{ inputs.docker-image }} + sync-tag: ${{ inputs.sync-tag }} + use-gha: ${{ inputs.use-gha }} + dashboard-tag: ${{ inputs.dashboard-tag }} + s3-bucket: ${{ inputs.s3-bucket }} + aws-role-to-assume: ${{ inputs.aws-role-to-assume }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/_linux-test-rg.yml b/.github/workflows/_linux-test-rg.yml new file mode 100644 index 0000000000000..6dc2f6c63bf3e --- /dev/null +++ b/.github/workflows/_linux-test-rg.yml @@ -0,0 +1,86 @@ +name: linux-test-label + +on: + workflow_call: + inputs: + build-environment: + required: true + type: string + description: Top-level label for what's being built/tested. + test-matrix: + required: true + type: string + description: JSON description of what test configs to run. + docker-image: + required: true + type: string + description: Docker image to run in. + sync-tag: + required: false + type: string + default: "" + description: | + If this is set, our linter will use this to make sure that every other + job with the same `sync-tag` is identical. + timeout-minutes: + required: false + type: number + default: 240 + description: | + Set the maximum (in minutes) how long the workflow should take to finish + use-gha: + required: false + type: string + default: "" + description: If set to any value, upload to GHA. Otherwise upload to S3. 
+ dashboard-tag: + required: false + type: string + default: "" + s3-bucket: + description: S3 bucket to download artifact + required: false + type: string + default: "gha-artifacts" + aws-role-to-assume: + description: role to assume for downloading artifacts + required: false + type: string + default: "" + secrets: + HUGGING_FACE_HUB_TOKEN: + required: false + description: | + HF Auth token to avoid rate limits when downloading models or datasets from hub + +env: + GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} + +jobs: + test: + # Don't run on forked repos or empty test matrix + if: github.repository_owner == 'pytorch' && toJSON(fromJSON(inputs.test-matrix).include) != '[]' + strategy: + matrix: ${{ fromJSON(inputs.test-matrix) }} + fail-fast: false + runs-on: + group: ${{ matrix.runner }} + timeout-minutes: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} + steps: + - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + + - name: Linux Test + id: linux-test + uses: ./.github/actions/linux-test + with: + build-environment: ${{ inputs.build-environment }} + test-matrix: ${{ inputs.test-matrix }} + docker-image: ${{ inputs.docker-image }} + sync-tag: ${{ inputs.sync-tag }} + use-gha: ${{ inputs.use-gha }} + dashboard-tag: ${{ inputs.dashboard-tag }} + s3-bucket: ${{ inputs.s3-bucket }} + aws-role-to-assume: ${{ inputs.aws-role-to-assume }} + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/_linux-test.yml b/.github/workflows/_linux-test.yml index 1d14950549a8e..5f3f290dd31da 100644 --- a/.github/workflows/_linux-test.yml +++ b/.github/workflows/_linux-test.yml @@ -37,6 +37,16 @@ on: required: false type: string default: "" + s3-bucket: + description: S3 bucket to download artifact + required: false + type: string + default: "gha-artifacts" + aws-role-to-assume: + description: role to assume for downloading artifacts + required: false + type: string + default: "" secrets: HUGGING_FACE_HUB_TOKEN: required: false @@ -71,6 +81,14 @@ jobs: - name: Setup Linux uses: ./.github/actions/setup-linux + - name: configure aws credentials + if : ${{ inputs.aws-role-to-assume != '' }} + uses: aws-actions/configure-aws-credentials@v3 + with: + role-to-assume: ${{ inputs.aws-role-to-assume }} + role-session-name: gha-linux-test + aws-region: us-east-1 + - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main @@ -91,10 +109,15 @@ jobs: with: docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + - name: Check if in a ARC runner + shell: bash + id: check_arc_runner + run: echo "IN_ARC_RUNNER=$([ -f /.inarc ] && echo true || echo false)" >> "$GITHUB_OUTPUT" + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG id: install-nvidia-driver uses: pytorch/test-infra/.github/actions/setup-nvidia@main - if: contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') + if: ${{ contains(inputs.build-environment, 'cuda') && !contains(matrix.config, 'nogpu') && steps.check_arc_runner.outputs.IN_ARC_RUNNER == 'false' }} - name: Lock NVIDIA A100 40GB Frequency run: | @@ -116,6 +139,11 @@ jobs: uses: ./.github/actions/download-build-artifacts with: name: ${{ inputs.build-environment }} + s3-bucket: ${{ inputs.s3-bucket }} + + - name: Download TD artifacts + continue-on-error: true + uses: ./.github/actions/download-td-artifacts - 
name: Parse ref id: parse-ref @@ -169,6 +197,10 @@ jobs: NUM_TEST_SHARDS: ${{ matrix.num_shards }} REENABLED_ISSUES: ${{ steps.keep-going.outputs.reenabled-issues }} CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} + VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} + NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }} + NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} + TD_DISTRIBUTED: ${{ steps.keep-going.outputs.ci-td-distributed }} SCCACHE_BUCKET: ossci-compiler-cache-circleci-v2 SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} SHM_SIZE: ${{ contains(inputs.build-environment, 'cuda') && '2g' || '1g' }} @@ -218,6 +250,10 @@ jobs: -e NUM_TEST_SHARDS \ -e REENABLED_ISSUES \ -e CONTINUE_THROUGH_ERROR \ + -e VERBOSE_TEST_LOGS \ + -e NO_TEST_TIMEOUT \ + -e NO_TD \ + -e TD_DISTRIBUTED \ -e PR_LABELS \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ @@ -230,7 +266,6 @@ jobs: -e HUGGING_FACE_HUB_TOKEN \ -e DASHBOARD_TAG \ --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ - --ulimit stack=10485760:83886080 \ --security-opt seccomp=unconfined \ --cap-add=SYS_PTRACE \ --ipc=host \ @@ -280,6 +315,7 @@ jobs: with: file-suffix: ${{ github.job }}-${{ matrix.config }}-${{ matrix.shard }}-${{ matrix.num_shards }}-${{ matrix.runner }}_${{ steps.get-job-id.outputs.job-id }} use-gha: ${{ inputs.use-gha }} + s3-bucket: ${{ inputs.s3-bucket }} - name: Collect backtraces from coredumps (if any) if: always() diff --git a/.github/workflows/_mac-test-mps.yml b/.github/workflows/_mac-test-mps.yml index b10c1f84bd7ff..2c0da2f8afd7c 100644 --- a/.github/workflows/_mac-test-mps.yml +++ b/.github/workflows/_mac-test-mps.yml @@ -34,12 +34,14 @@ jobs: test-matrix: ${{ steps.filter.outputs.test-matrix }} is-test-matrix-empty: ${{ steps.filter.outputs.is-test-matrix-empty }} keep-going: ${{ steps.filter.outputs.keep-going }} + ci-verbose-test-logs: ${{ steps.filter.outputs.ci-verbose-test-logs }} + ci-no-test-timeout: ${{ steps.filter.outputs.ci-no-test-timeout }} + ci-no-td: ${{ steps.filter.outputs.ci-no-td }} reenabled-issues: ${{ steps.filter.outputs.reenabled-issues }} steps: - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@main with: - fetch-depth: 1 submodules: false - name: Select all requested test configurations @@ -95,6 +97,9 @@ jobs: PY_VERS: 3.9 PR_BODY: ${{ github.event.pull_request.body }} CONTINUE_THROUGH_ERROR: ${{ needs.filter.outputs.keep-going }} + VERBOSE_TEST_LOGS: ${{ needs.filter.outputs.ci-verbose-test-logs }} + NO_TEST_TIMEOUT: ${{ needs.filter.outputs.ci-no-test-timeout }} + NO_TD: ${{ needs.filter.outputs.ci-no-td }} PIP_REQUIREMENTS_FILE: .github/requirements/pip-requirements-${{ runner.os }}.txt REENABLED_ISSUES: ${{ needs.filter.outputs.reenabled-issues }} run: | diff --git a/.github/workflows/_mac-test.yml b/.github/workflows/_mac-test.yml index 4848a566f15ed..b8e90771ec73b 100644 --- a/.github/workflows/_mac-test.yml +++ b/.github/workflows/_mac-test.yml @@ -91,6 +91,12 @@ jobs: name: ${{ inputs.build-environment }} use-gha: true + - name: Download TD artifacts + continue-on-error: true + uses: ./.github/actions/download-td-artifacts + with: + use-gha: true + - name: Setup miniconda uses: pytorch/test-infra/.github/actions/setup-miniconda@main with: @@ -148,6 +154,9 @@ jobs: PYTORCH_TEST_CUDA_MEM_LEAK_CHECK: ${{ matrix.mem_leak_check && '1' || '0' }} PYTORCH_TEST_RERUN_DISABLED_TESTS: ${{ matrix.rerun_disabled_tests && '1' || '0' }} CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} + 
VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} + NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }} + NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} PIP_REQUIREMENTS_FILE: .github/requirements/pip-requirements-${{ runner.os }}.txt GITHUB_REPOSITORY: ${{ github.repository }} GITHUB_WORKFLOW: ${{ github.workflow }} diff --git a/.github/workflows/_rocm-test.yml b/.github/workflows/_rocm-test.yml index 649cae5a2c20b..1f2d86273ee14 100644 --- a/.github/workflows/_rocm-test.yml +++ b/.github/workflows/_rocm-test.yml @@ -42,6 +42,10 @@ on: env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} +permissions: + id-token: write + contents: read + jobs: test: # Don't run on forked repos or empty test matrix @@ -61,6 +65,19 @@ jobs: - name: Setup ROCm uses: ./.github/actions/setup-rocm + - name: configure aws credentials + id: aws_creds + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only + aws-region: us-east-1 + role-duration-seconds: 18000 + + - name: Login to Amazon ECR + id: login-ecr + continue-on-error: true + uses: aws-actions/amazon-ecr-login@v2 + - name: Calculate docker image id: calculate-docker-image uses: pytorch/test-infra/.github/actions/calculate-docker-image@main @@ -86,6 +103,10 @@ jobs: with: name: ${{ inputs.build-environment }} + - name: Download TD artifacts + continue-on-error: true + uses: ./.github/actions/download-td-artifacts + - name: Parse ref id: parse-ref run: .github/scripts/parse_ref.py @@ -132,6 +153,9 @@ jobs: BRANCH: ${{ steps.parse-ref.outputs.branch }} SHA1: ${{ github.event.pull_request.head.sha || github.sha }} CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} + VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} + NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }} + NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} @@ -180,6 +204,9 @@ jobs: -e NUM_TEST_SHARDS \ -e REENABLED_ISSUES \ -e CONTINUE_THROUGH_ERROR \ + -e VERBOSE_TEST_LOGS \ + -e NO_TEST_TIMEOUT \ + -e NO_TD \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ diff --git a/.github/workflows/_win-build.yml b/.github/workflows/_win-build.yml index 6d041a19c4dc7..bc381c50628d1 100644 --- a/.github/workflows/_win-build.yml +++ b/.github/workflows/_win-build.yml @@ -128,6 +128,7 @@ jobs: PYTHON_VERSION: "3.8" SCCACHE_BUCKET: "ossci-compiler-cache" SCCACHE_S3_KEY_PREFIX: ${{ github.workflow }} + SCCACHE_REGION: us-east-1 VC_PRODUCT: "BuildTools" VC_VERSION: "" VC_YEAR: "2019" diff --git a/.github/workflows/_win-test.yml b/.github/workflows/_win-test.yml index ebfe4211b34c7..99d037f0355ce 100644 --- a/.github/workflows/_win-test.yml +++ b/.github/workflows/_win-test.yml @@ -25,7 +25,7 @@ on: timeout-minutes: required: false type: number - default: 300 + default: 240 description: | Set the maximum (in minutes) how long the workflow should take to finish @@ -92,7 +92,7 @@ jobs: retry_wait_seconds: 30 command: | set -eu - python3 -m pip install rockset==1.0.3 + python3 -m pip install rockset==1.0.3 'xdoctest>=1.1.0' - name: Start monitoring script id: monitor-script @@ -114,6 +114,10 @@ jobs: run: | tree /F C:\$Env:GITHUB_RUN_ID\build-results + - name: Download TD artifacts + continue-on-error: true + uses: ./.github/actions/download-td-artifacts + - name: Get 
workflow job id id: get-job-id uses: ./.github/actions/get-workflow-job-id @@ -132,14 +136,26 @@ jobs: test-matrix: ${{ inputs.test-matrix }} job-name: ${{ steps.get-job-id.outputs.job-name }} + - name: Set Test step time + id: test-timeout + shell: bash + env: + JOB_TIMEOUT: ${{ matrix.mem_leak_check == 'mem_leak_check' && 600 || inputs.timeout-minutes }} + run: | + echo "timeout=$((JOB_TIMEOUT-30))" >> "${GITHUB_OUTPUT}" + - name: Test id: test shell: bash + timeout-minutes: ${{ fromJson(steps.test-timeout.outputs.timeout) }} env: USE_CUDA: ${{ inputs.cuda-version != 'cpu' && '1' || '0' }} INSTALL_WINDOWS_SDK: 1 PYTHON_VERSION: 3.8 CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} + VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} + NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }} + NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} VC_PRODUCT: "BuildTools" VC_VERSION: "" VS_VERSION: "16.8.6" diff --git a/.github/workflows/_xpu-test.yml b/.github/workflows/_xpu-test.yml index 1122454b46fc7..d7af711f8adb4 100644 --- a/.github/workflows/_xpu-test.yml +++ b/.github/workflows/_xpu-test.yml @@ -42,10 +42,6 @@ on: env: GIT_DEFAULT_BRANCH: ${{ github.event.repository.default_branch }} -permissions: - id-token: write - contents: read - jobs: test: # Don't run on forked repos or empty test matrix @@ -67,7 +63,7 @@ jobs: id: aws_creds uses: aws-actions/configure-aws-credentials@v1.7.0 with: - role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_pytorch_artifacts + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_s3_and_ecr_read_only aws-region: us-east-1 - name: Login to Amazon ECR @@ -147,6 +143,9 @@ jobs: PYTORCH_RETRY_TEST_CASES: 1 PYTORCH_OVERRIDE_FLAKY_SIGNAL: 1 CONTINUE_THROUGH_ERROR: ${{ steps.keep-going.outputs.keep-going }} + VERBOSE_TEST_LOGS: ${{ steps.keep-going.outputs.ci-verbose-test-logs }} + NO_TEST_TIMEOUT: ${{ steps.keep-going.outputs.ci-no-test-timeout }} + NO_TD: ${{ steps.keep-going.outputs.ci-no-td }} TEST_CONFIG: ${{ matrix.config }} SHARD_NUMBER: ${{ matrix.shard }} NUM_TEST_SHARDS: ${{ matrix.num_shards }} @@ -189,6 +188,9 @@ jobs: -e PYTORCH_RETRY_TEST_CASES \ -e PYTORCH_OVERRIDE_FLAKY_SIGNAL \ -e CONTINUE_THROUGH_ERROR \ + -e VERBOSE_TEST_LOGS \ + -e NO_TEST_TIMEOUT \ + -e NO_TD \ -e MAX_JOBS="$(nproc --ignore=2)" \ -e SCCACHE_BUCKET \ -e XLA_CLANG_CACHE_S3_BUCKET_NAME \ diff --git a/.github/workflows/auto_request_review.yml b/.github/workflows/auto_request_review.yml index 7c98c2990fba7..25eb72bc2faab 100644 --- a/.github/workflows/auto_request_review.yml +++ b/.github/workflows/auto_request_review.yml @@ -3,11 +3,13 @@ name: Auto Request Review on: pull_request: types: [opened, ready_for_review, reopened] - jobs: auto-request-review: # Don't run on forked repos if: ${{ !github.event.pull_request.head.repo.fork }} + permissions: + contents: read + pull-requests: write name: Auto Request Review runs-on: ubuntu-latest steps: diff --git a/.github/workflows/build-triton-wheel.yml b/.github/workflows/build-triton-wheel.yml index ef73a386ef590..ddba8ff8907cc 100644 --- a/.github/workflows/build-triton-wheel.yml +++ b/.github/workflows/build-triton-wheel.yml @@ -37,7 +37,7 @@ jobs: device: ["cuda", "rocm"] include: - device: "rocm" - rocm_version: "5.7" + rocm_version: "6.1" - device: "cuda" rocm_version: "" timeout-minutes: 40 @@ -119,8 +119,7 @@ jobs: - uses: actions/upload-artifact@v3 with: - # NB: Use the same name here and all wheels can be downloaded by referring to the same artifact - name: 
pytorch-triton-wheel + name: pytorch-triton-wheel-${{ matrix.py_vers }}-${{ matrix.device }} if-no-files-found: error path: ${{ runner.temp }}/artifacts/* @@ -131,17 +130,41 @@ jobs: upload-wheel: runs-on: ubuntu-22.04 needs: build-wheel + permissions: + id-token: write + contents: read container: image: continuumio/miniconda3:4.12.0 environment: ${{ (github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v'))) && 'conda-aws-upload' || '' }} steps: - uses: actions/checkout@v3 + - name: Configure AWS credentials(PyTorch account) for main + if: ${{ github.event_name == 'push' && github.event.ref == 'refs/heads/main' }} + uses: aws-actions/configure-aws-credentials@v3 + with: + role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_nightly_build_wheels + aws-region: us-east-1 + + - name: Configure AWS credentials(PyTorch account) for RC builds + if: ${{ github.event_name == 'push' && (startsWith(github.event.ref, 'refs/tags/') && !startsWith(github.event.ref, 'refs/tags/ciflow/')) }} + uses: aws-actions/configure-aws-credentials@v3 + with: + role-to-assume: arn:aws:iam::749337293305:role/gha_workflow_test_build_wheels + aws-region: us-east-1 + - name: Download Build Artifacts uses: actions/download-artifact@v3 with: - name: pytorch-triton-wheel - path: ${{ runner.temp }}/artifacts/ + # Download all available artifacts + path: ${{ runner.temp }}/artifacts-all + + - name: Select Wheel Artifacts + shell: bash + run: | + set -x + mkdir -p "${RUNNER_TEMP}/artifacts/" + mv "${RUNNER_TEMP}"/artifacts-all/pytorch-triton-wheel-*/* "${RUNNER_TEMP}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }} @@ -168,9 +191,6 @@ jobs: # to nightly or test UPLOAD_SUBFOLDER: "" PKG_DIR: ${{ runner.temp }}/artifacts - # When running these on pull_request events these should be blank - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} shell: bash run: | set -ex @@ -232,8 +252,7 @@ jobs: - uses: actions/upload-artifact@v3 with: - # NB: Use the same name here and all wheels can be downloaded by referring to the same artifact - name: pytorch-triton-conda + name: pytorch-triton-conda-${{ matrix.py_vers }} if-no-files-found: error path: ${{ runner.temp }}/artifacts/* @@ -253,8 +272,15 @@ jobs: - name: Download Build Artifacts uses: actions/download-artifact@v3 with: - name: pytorch-triton-conda - path: ${{ runner.temp }}/artifacts/ + # Download all available artifacts + path: ${{ runner.temp }}/artifacts-all + + - name: Select Conda Artifacts + shell: bash + run: | + set -x + mkdir -p "${RUNNER_TEMP}/artifacts/" + mv "${RUNNER_TEMP}"/artifacts-all/pytorch-triton-conda-*/* "${RUNNER_TEMP}/artifacts/" - name: Set DRY_RUN (only for tagged pushes) if: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) }} diff --git a/.github/workflows/check-labels.yml b/.github/workflows/check-labels.yml index d44f91936c342..d638d588504f2 100644 --- a/.github/workflows/check-labels.yml +++ b/.github/workflows/check-labels.yml @@ -9,13 +9,14 @@ on: pull_request_target: types: [opened, synchronize, reopened, labeled, unlabeled] branches: [main] - paths-ignore: [.github] - # To allow testing PRs that change workflows. - # May be triggered together with pull_request_target, it's OK. 
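Because each matrix entry of `build-triton-wheel.yml` now uploads under its own artifact name (`pytorch-triton-wheel-<py>-<device>`, `pytorch-triton-conda-<py>`), the upload jobs download every artifact and then flatten the per-matrix directories into a single folder before publishing. A rough Python equivalent of the shell `mv` consolidation step (paths are illustrative):

```python
import shutil
from pathlib import Path


def consolidate_artifacts(artifacts_all: Path, dest: Path, prefix: str) -> None:
    # Each matrix entry uploaded into artifacts-all/<prefix><suffix>/;
    # gather everything into a single flat directory.
    dest.mkdir(parents=True, exist_ok=True)
    for per_matrix_dir in artifacts_all.glob(f"{prefix}*"):
        for artifact in per_matrix_dir.iterdir():
            shutil.move(str(artifact), str(dest / artifact.name))


consolidate_artifacts(Path("artifacts-all"), Path("artifacts"), "pytorch-triton-wheel-")
```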
+ # To check labels on ghstack PRs. + # Note: as pull_request doesn't trigger on PRs targeting main, + # to test changes to the workflow itself one needs to create + # a PR that targets a gh/**/base branch. pull_request: types: [opened, synchronize, reopened, labeled, unlabeled] - paths: [.github] + branches: [gh/**/base] workflow_dispatch: @@ -26,6 +27,7 @@ concurrency: jobs: check-labels: name: Check labels + if: github.repository_owner == 'pytorch' runs-on: linux.20_04.4x steps: - name: Checkout PyTorch diff --git a/.github/workflows/check_mergeability_ghstack.yml b/.github/workflows/check_mergeability_ghstack.yml index 41994c7ebbf77..562687564054f 100644 --- a/.github/workflows/check_mergeability_ghstack.yml +++ b/.github/workflows/check_mergeability_ghstack.yml @@ -1,29 +1,84 @@ -name: Check mergeability and dependencies for ghstack prs +name: Check mergeability of ghstack PR on: pull_request: - types: [opened, synchronize, reopened, edited] + types: [opened, synchronize, reopened] + branches: [gh/**/base] jobs: - check-regex: + ghstack-mergeability-check: runs-on: ubuntu-latest - outputs: - regex-match: ${{ steps.regex-match.outputs.match }} steps: - uses: actions/checkout@v4 + with: + fetch-depth: 0 + + - name: Setup git + shell: bash + run: | + git config --global user.email "pytorchmergebot@users.noreply.github.com" + git config --global user.name "PyTorch MergeBot" + git fetch origin main:main + + - name: Wait for orig branch + shell: bash + run: | + BRANCH="${{ github.base_ref }}" + echo "$BRANCH" + BRANCH="${BRANCH%/base}/orig" + echo "$BRANCH" + + WAIT_SECONDS=300 + END_WAIT=$((SECONDS+WAIT_SECONDS)) + BRANCH_EXISTS=0 + + while [ $SECONDS -lt $END_WAIT ]; do + git fetch --prune origin "${BRANCH}" || true + if git rev-parse --verify "origin/${BRANCH}"; then + BRANCH_EXISTS=1 + break + fi + echo "Waiting for branch ${BRANCH} to exist..." + sleep 30 # Wait for 30 seconds before retrying + done - - id: regex-match - uses: actions-ecosystem/action-regex-match@d50fd2e7a37d0e617aea3d7ada663bd56862b9cc + if [ $BRANCH_EXISTS -eq 0 ]; then + echo "Branch ${BRANCH} not found after ${WAIT_SECONDS} seconds." + echo "Mergeability check failed for infrastructure reasons." 
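+          # Note: a new push to the PR re-runs this workflow via the 'synchronize' trigger above, so this check is retried once the orig branch has been pushed (editorial comment; the retry behaviour is an assumption based on the trigger list, not stated in the script itself).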
+ exit 1 + fi + + - name: Setup Python + uses: actions/setup-python@v4 with: - text: ${{ github.head_ref }} - regex: '^(gh/[^/]+/[0-9]+/)head$' - - pr-dependencies-check: - needs: check-regex - if: ${{ needs.check-regex.outputs.regex-match != '' }} - uses: pytorch/test-infra/.github/workflows/pr-dependencies-check.yml@main - with: - pr_number: ${{ github.event.pull_request.number }} + python-version: '3.8' + cache: pip + architecture: x64 + + - run: pip install pyyaml==6.0 rockset==1.0.3 + shell: bash + + - name: Verify mergeability + shell: bash + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + PR_NUM: ${{ github.event.pull_request.number }} + run: | + set -ex + python3 .github/scripts/trymerge.py --check-mergeability "${PR_NUM}" + + - name: Print debug info + if: failure() + shell: bash + env: + PR_NUM: ${{ github.event.pull_request.number }} + run: | + { + echo "# PR $PR_NUM is not mergeable into main" + echo "To debug, run the diagnostic workflow:" + echo "https://github.com/pytorch/test-infra/actions/workflows/pr-dependencies-check.yml" + } >> "$GITHUB_STEP_SUMMARY" + concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} diff --git a/.github/workflows/cherry-pick.yml b/.github/workflows/cherry-pick.yml new file mode 100644 index 0000000000000..059ad781d748d --- /dev/null +++ b/.github/workflows/cherry-pick.yml @@ -0,0 +1,57 @@ +name: Create a cherry pick from a PR + +on: + repository_dispatch: + types: [try-cherry-pick] + +jobs: + cherry-pick: + name: cherry-pick-pr-${{ github.event.client_payload.pr_num }} + runs-on: ubuntu-latest + environment: cherry-pick-bot + env: + GH_RUN_URL: ${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }} + steps: + - name: Checkout repo + id: checkout + uses: actions/checkout@v3 + with: + fetch-depth: 0 + token: ${{ secrets.GH_PYTORCHBOT_CHERRY_PICK_TOKEN }} + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.11' + cache: pip + + # Not the direct dependencies but the script uses trymerge + - run: pip install pyyaml==6.0 rockset==1.0.3 + + - name: Setup committer id + run: | + git config --global user.name "PyTorch Bot" + git config --global user.email "pytorchbot@users.noreply.github.com" + + - name: Cherry pick the PR + shell: bash + env: + PR_NUM: ${{ github.event.client_payload.pr_num }} + BRANCH: ${{ github.event.client_payload.branch }} + CLASSIFICATION: ${{ github.event.client_payload.classification }} + FIXES: ${{ github.event.client_payload.fixes || '' }} + ACTOR: ${{ github.actor }} + GITHUB_TOKEN: ${{ secrets.GH_PYTORCHBOT_CHERRY_PICK_TOKEN }} + run: | + set -ex + + python .github/scripts/cherry_pick.py \ + --onto-branch "${BRANCH}" \ + --classification "${CLASSIFICATION}" \ + --fixes "${FIXES}" \ + --github-actor "${ACTOR}" \ + "${PR_NUM}" + +concurrency: + group: cherry-pick-pr-${{ github.event.client_payload.pr_num }} + cancel-in-progress: true diff --git a/.github/workflows/close-nonexistent-disable-issues.yml b/.github/workflows/close-nonexistent-disable-issues.yml index 26c74286114a6..f384295b84b8a 100644 --- a/.github/workflows/close-nonexistent-disable-issues.yml +++ b/.github/workflows/close-nonexistent-disable-issues.yml @@ -6,6 +6,7 @@ on: jobs: close-nonexistent-disable-issues: + environment: rockset-read-only if: github.repository_owner == 'pytorch' runs-on: ubuntu-latest steps: diff --git a/.github/workflows/create_release.yml b/.github/workflows/create_release.yml index 
ef263c5a3d656..c80b61c22c5e7 100644 --- a/.github/workflows/create_release.yml +++ b/.github/workflows/create_release.yml @@ -15,6 +15,9 @@ jobs: if: ${{ github.repository == 'pytorch/pytorch' }} name: Create Release runs-on: ubuntu-latest + # https://github.com/softprops/action-gh-release?tab=readme-ov-file#permissions + permissions: + contents: write steps: - uses: malfet/checkout@silent-checkout with: diff --git a/.github/workflows/delete_old_branches.yml b/.github/workflows/delete_old_branches.yml new file mode 100644 index 0000000000000..04a0521419a8e --- /dev/null +++ b/.github/workflows/delete_old_branches.yml @@ -0,0 +1,39 @@ +# A workflow that deletes branches of closed PRs + +name: Delete old branches + +on: + schedule: + # Run daily. + - cron: 30 1 * * * + workflow_dispatch: + +concurrency: + group: delete-old-branches + cancel-in-progress: true + +permissions: + contents: write + +jobs: + delete: + if: ${{ github.repository == 'pytorch/pytorch' }} + runs-on: ubuntu-latest + + steps: + - name: Checkout repo + uses: actions/checkout@v3 + with: + fetch-depth: 0 + + - name: Setup Python + uses: actions/setup-python@v4 + with: + python-version: '3.8' + architecture: x64 + check-latest: false + + - name: Delete old branches + run: python .github/scripts/delete_old_branches.py + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} diff --git a/.github/workflows/docker-builds.yml b/.github/workflows/docker-builds.yml index c006b0cfac27d..6d822165895eb 100644 --- a/.github/workflows/docker-builds.yml +++ b/.github/workflows/docker-builds.yml @@ -27,32 +27,40 @@ env: ALPINE_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine AWS_DEFAULT_REGION: us-east-1 +permissions: read-all + jobs: docker-build: - runs-on: [self-hosted, linux.2xlarge] environment: ${{ (github.ref == 'refs/heads/main' || startsWith(github.event.ref, 'refs/tags/v')) && 'docker-build' || '' }} timeout-minutes: 240 strategy: fail-fast: false matrix: + runner: [linux.12xlarge] + docker-image-name: [ + pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9, + pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks, + pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9, + pytorch-linux-focal-py3.8-clang10, + pytorch-linux-focal-py3.11-clang10, + pytorch-linux-focal-py3.12-clang10, + pytorch-linux-focal-rocm-n-1-py3, + pytorch-linux-focal-rocm-n-py3, + pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12, + pytorch-linux-focal-py3-clang9-android-ndk-r21e, + pytorch-linux-jammy-py3.8-gcc11, + pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks, + pytorch-linux-jammy-xpu-2024.0-py3, + pytorch-linux-jammy-py3-clang15-asan, + pytorch-linux-focal-py3-clang10-onnx, + pytorch-linux-focal-linter, + pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter, + pytorch-linux-jammy-py3-clang12-executorch + ] include: - - docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9 - - docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks - - docker-image-name: pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9 - - docker-image-name: pytorch-linux-focal-py3.8-clang10 - - docker-image-name: pytorch-linux-focal-py3.11-clang10 - - docker-image-name: pytorch-linux-focal-rocm-n-1-py3 - - docker-image-name: pytorch-linux-focal-rocm-n-py3 - - docker-image-name: pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12 - - docker-image-name: pytorch-linux-focal-py3-clang9-android-ndk-r21e - - docker-image-name: pytorch-linux-jammy-py3.8-gcc11 - - docker-image-name: pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks - - docker-image-name: 
pytorch-linux-jammy-xpu-2024.0-py3 - - docker-image-name: pytorch-linux-jammy-py3-clang15-asan - - docker-image-name: pytorch-linux-focal-py3-clang10-onnx - - docker-image-name: pytorch-linux-focal-linter - - docker-image-name: pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter - - docker-image-name: pytorch-linux-jammy-py3-clang12-executorch + - docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11 + runner: linux.arm64.2xlarge + runs-on: [self-hosted, "${{ matrix.runner }}"] env: DOCKER_IMAGE_BASE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/${{ matrix.docker-image-name }} steps: @@ -107,6 +115,8 @@ jobs: - name: Chown workspace uses: ./.github/actions/chown-workspace + with: + ALPINE_IMAGE: 308535385114.dkr.ecr.us-east-1.amazonaws.com/${{ (matrix.runner == 'linux.arm64.2xlarge') && 'arm64v8' || 'tool' }}/alpine if: always() - name: Teardown Linux diff --git a/.github/workflows/docker-release.yml b/.github/workflows/docker-release.yml index bdc4fa3b0c010..4ece88d5e47da 100644 --- a/.github/workflows/docker-release.yml +++ b/.github/workflows/docker-release.yml @@ -7,10 +7,13 @@ on: - Dockerfile - docker.Makefile - .github/workflows/docker-release.yml + - .github/scripts/generate_docker_release_matrix.py push: branches: - nightly tags: + # Final Release tags look like: v1.11.0 + - v[0-9]+.[0-9]+.[0-9]+ # Release candidate tags look like: v1.11.0-rc1 - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - ciflow/nightly/* @@ -28,6 +31,8 @@ env: USE_BUILDX: 1 WITH_PUSH: ${{ github.event_name == 'push' && (github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/v')) }} +permissions: read-all + jobs: generate-matrix: if: github.repository_owner == 'pytorch' @@ -99,6 +104,16 @@ jobs: echo "${RUNNER_TEMP}/bin" >> "${GITHUB_PATH}" # Generate PyTorch version to use echo "PYTORCH_VERSION=$(python3 .github/scripts/generate_pytorch_version.py --no-build-suffix)" >> "${GITHUB_ENV}" + - name: Setup test specific variables + if: ${{ startsWith(github.event.ref, 'refs/tags/v') }} + run: | + if [[ ${{ github.event.ref }} =~ ^refs/tags/v[0-9]+\.[0-9]+\.[0-9]+-rc[0-9]+$ ]]; then + { + echo "DOCKER_IMAGE=pytorch-test"; + echo "INSTALL_CHANNEL=pytorch-test"; + echo "TRITON_VERSION=$(cut -f 1 .ci/docker/triton_version.txt)"; + } >> "${GITHUB_ENV}" + fi - name: Setup nightly specific variables + if: ${{ github.event.ref == 'refs/heads/nightly' || startsWith(github.event.ref, 'refs/tags/ciflow/nightly/') }} run: | @@ -115,17 +130,27 @@ jobs: if: ${{ github.event.ref == 'refs/heads/nightly' && matrix.image_type == 'runtime' }} run: | PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-cuda${CUDA_VERSION_SHORT}-cudnn${CUDNN_VERSION}-runtime" + CUDA_SUFFIX="-cu${CUDA_VERSION}" + if [[ ${CUDA_VERSION_SHORT} == "cpu" ]]; then + PYTORCH_DOCKER_TAG="${PYTORCH_VERSION}-runtime" + CUDA_SUFFIX="" + fi PYTORCH_NIGHTLY_COMMIT=$(docker run ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_DOCKER_TAG}" \ python -c 'import torch; print(torch.version.git_version[:7],end="")') docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_DOCKER_TAG}" \ - ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}" - docker push ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}" + ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}" + + docker push ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}" + + # Please note, here we need to pin a specific version of CUDA for the latest label + if [[ ${CUDA_VERSION_SHORT} == "12.1" ]]; then + docker tag
ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}${CUDA_SUFFIX}" \ + ghcr.io/pytorch/pytorch-nightly:latest + docker push ghcr.io/pytorch/pytorch-nightly:latest + fi - docker tag ghcr.io/pytorch/pytorch-nightly:"${PYTORCH_NIGHTLY_COMMIT}-cu${CUDA_VERSION}" \ - ghcr.io/pytorch/pytorch-nightly:latest - docker push ghcr.io/pytorch/pytorch-nightly:latest - name: Teardown Linux uses: pytorch/test-infra/.github/actions/teardown-linux@main if: always() diff --git a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml index 50e5eb0eef115..79a73abda9f76 100644 --- a/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-aarch64-binary-manywheel-nightly.yml @@ -4,6 +4,7 @@ # Generation script: .github/scripts/generate_ci_workflows.py name: linux-aarch64-binary-manywheel + on: push: # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build @@ -30,7 +31,7 @@ env: PYTORCH_FINAL_PACKAGE_DIR: /artifacts PYTORCH_ROOT: /pytorch SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: linux-aarch64-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -53,7 +54,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_8-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 
'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cpu-aarch64-test: # Testing @@ -78,6 +79,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cpu-aarch64-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_8-cpu-aarch64-test with: PYTORCH_ROOT: /pytorch @@ -92,8 +96,6 @@ jobs: build_name: manywheel-py3_8-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -115,7 +117,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_9-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cpu-aarch64-test: # Testing @@ -140,6 +142,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cpu-aarch64-upload: # Uploading if: ${{ 
github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_9-cpu-aarch64-test with: PYTORCH_ROOT: /pytorch @@ -154,8 +159,6 @@ jobs: build_name: manywheel-py3_9-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -177,7 +180,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_10-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cpu-aarch64-test: # Testing @@ -202,6 +205,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cpu-aarch64-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_10-cpu-aarch64-test with: PYTORCH_ROOT: /pytorch @@ -216,8 +222,6 @@ jobs: build_name: manywheel-py3_10-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ 
secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -239,7 +243,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_11-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cpu-aarch64-test: # Testing @@ -264,6 +268,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cpu-aarch64-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_11-cpu-aarch64-test with: PYTORCH_ROOT: /pytorch @@ -278,8 +285,6 @@ jobs: build_name: manywheel-py3_11-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -301,7 +306,7 @@ jobs: ALPINE_IMAGE: "arm64v8/alpine" build_name: manywheel-py3_12-cpu-aarch64 build_environment: linux-aarch64-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cpu-aarch64-test: # Testing @@ -326,6 +331,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cpu-aarch64-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_12-cpu-aarch64-test with: PYTORCH_ROOT: /pytorch @@ -340,8 +348,6 @@ jobs: build_name: manywheel-py3_12-cpu-aarch64 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-linux-binary-conda-nightly.yml b/.github/workflows/generated-linux-binary-conda-nightly.yml index 8b5b68c9a1866..50a6d986255f7 100644 --- a/.github/workflows/generated-linux-binary-conda-nightly.yml +++ b/.github/workflows/generated-linux-binary-conda-nightly.yml @@ -4,6 +4,7 @@ # Generation script: .github/scripts/generate_ci_workflows.py name: linux-binary-conda + on: push: # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build @@ -30,7 +31,7 @@ env: PYTORCH_FINAL_PACKAGE_DIR: /artifacts PYTORCH_ROOT: 
/pytorch SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: linux-binary-conda-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -74,6 +75,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_8-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_8-cpu-test with: PYTORCH_ROOT: /pytorch @@ -88,8 +92,6 @@ jobs: build_name: conda-py3_8-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -135,6 +137,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_8-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_8-cuda11_8-test with: PYTORCH_ROOT: /pytorch @@ -150,8 +155,6 @@ jobs: build_name: conda-py3_8-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -197,6 +200,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_8-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_8-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ -212,8 +218,69 @@ jobs: build_name: conda-py3_8-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + conda-py3_8-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.8" + runs_on: linux.24xlarge + build_name: conda-py3_8-cuda12_4 + build_environment: linux-binary-conda + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_8-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_8-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + 
GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.8" + build_name: conda-py3_8-cuda12_4 + build_environment: linux-binary-conda + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_8-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_8-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.8" + build_name: conda-py3_8-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -256,6 +323,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_9-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_9-cpu-test with: PYTORCH_ROOT: /pytorch @@ -270,8 +340,6 @@ jobs: build_name: conda-py3_9-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -317,6 +385,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_9-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_9-cuda11_8-test with: PYTORCH_ROOT: /pytorch @@ -332,8 +403,6 @@ jobs: build_name: conda-py3_9-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -379,6 +448,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_9-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_9-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ -394,8 +466,69 @@ jobs: build_name: conda-py3_9-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + conda-py3_9-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually 
want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.9" + runs_on: linux.24xlarge + build_name: conda-py3_9-cuda12_4 + build_environment: linux-binary-conda + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_9-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.9" + build_name: conda-py3_9-cuda12_4 + build_environment: linux-binary-conda + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_9-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_9-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.9" + build_name: conda-py3_9-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -438,6 +571,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_10-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_10-cpu-test with: PYTORCH_ROOT: /pytorch @@ -452,8 +588,6 @@ jobs: build_name: conda-py3_10-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -499,6 +633,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_10-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_10-cuda11_8-test with: PYTORCH_ROOT: /pytorch @@ -514,8 +651,6 @@ jobs: build_name: conda-py3_10-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -561,6 +696,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_10-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_10-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ 
-576,8 +714,69 @@ jobs: build_name: conda-py3_10-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + conda-py3_10-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.10" + runs_on: linux.24xlarge + build_name: conda-py3_10-cuda12_4 + build_environment: linux-binary-conda + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_10-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.10" + build_name: conda-py3_10-cuda12_4 + build_environment: linux-binary-conda + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_10-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_10-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.10" + build_name: conda-py3_10-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -620,6 +819,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_11-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_11-cpu-test with: PYTORCH_ROOT: /pytorch @@ -634,8 +836,6 @@ jobs: build_name: conda-py3_11-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -681,6 +881,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_11-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_11-cuda11_8-test with: 
PYTORCH_ROOT: /pytorch @@ -696,8 +899,6 @@ jobs: build_name: conda-py3_11-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -743,6 +944,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_11-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_11-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ -758,8 +962,69 @@ jobs: build_name: conda-py3_11-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + conda-py3_11-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.11" + runs_on: linux.24xlarge + build_name: conda-py3_11-cuda12_4 + build_environment: linux-binary-conda + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_11-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_11-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.11" + build_name: conda-py3_11-cuda12_4 + build_environment: linux-binary-conda + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_11-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_11-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.11" + build_name: conda-py3_11-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -802,6 +1067,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: 
conda-py3_12-cpu-test with: PYTORCH_ROOT: /pytorch @@ -816,8 +1084,6 @@ jobs: build_name: conda-py3_12-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -863,6 +1129,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_12-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_12-cuda11_8-test with: PYTORCH_ROOT: /pytorch @@ -878,8 +1147,6 @@ jobs: build_name: conda-py3_12-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -925,6 +1192,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} conda-py3_12-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_12-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ -940,8 +1210,69 @@ jobs: build_name: conda-py3_12-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + conda-py3_12-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.12" + runs_on: linux.24xlarge + build_name: conda-py3_12-cuda12_4 + build_environment: linux-binary-conda + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_12-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_12-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.12" + build_name: conda-py3_12-cuda12_4 + build_environment: linux-binary-conda + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-py3_12-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_12-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: 
/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/conda-builder:cuda12.4-main + DESIRED_PYTHON: "3.12" + build_name: conda-py3_12-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-main.yml b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-main.yml index 5b2869c793502..5577a5e7d9c3a 100644 --- a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-main.yml +++ b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-main.yml @@ -4,6 +4,7 @@ # Generation script: .github/scripts/generate_ci_workflows.py name: linux-binary-libtorch-cxx11-abi + on: push: branches: @@ -25,7 +26,7 @@ env: PYTORCH_FINAL_PACKAGE_DIR: /artifacts PYTORCH_ROOT: /pytorch SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: linux-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml index 4e887565d569f..d400e82249867 100644 --- a/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml +++ b/.github/workflows/generated-linux-binary-libtorch-cxx11-abi-nightly.yml @@ -4,6 +4,7 @@ # Generation script: .github/scripts/generate_ci_workflows.py name: linux-binary-libtorch-cxx11-abi + on: push: # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build @@ -30,7 +31,7 @@ env: PYTORCH_FINAL_PACKAGE_DIR: /artifacts PYTORCH_ROOT: /pytorch SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: linux-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -76,6 +77,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cpu-shared-with-deps-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cpu-shared-with-deps-cxx11-abi-test with: PYTORCH_ROOT: /pytorch @@ -91,8 +95,6 @@ jobs: build_name: libtorch-cpu-shared-with-deps-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -139,6 +141,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda11_8-shared-with-deps-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cuda11_8-shared-with-deps-cxx11-abi-test with: 
PYTORCH_ROOT: /pytorch @@ -155,8 +160,6 @@ jobs: build_name: libtorch-cuda11_8-shared-with-deps-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -203,6 +206,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda12_1-shared-with-deps-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cuda12_1-shared-with-deps-cxx11-abi-test with: PYTORCH_ROOT: /pytorch @@ -219,13 +225,76 @@ jobs: build_name: libtorch-cuda12_1-shared-with-deps-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - libtorch-rocm5_6-shared-with-deps-cxx11-abi-build: + libtorch-cuda12_4-shared-with-deps-cxx11-abi-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.4-main + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + build_name: libtorch-cuda12_4-shared-with-deps-cxx11-abi + build_environment: linux-binary-libtorch-cxx11-abi + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_4-shared-with-deps-cxx11-abi-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda12_4-shared-with-deps-cxx11-abi-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.4-main + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + build_name: libtorch-cuda12_4-shared-with-deps-cxx11-abi + build_environment: linux-binary-libtorch-cxx11-abi + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_4-shared-with-deps-cxx11-abi-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_4-shared-with-deps-cxx11-abi-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cuda12.4-main + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: cxx11-abi + build_name: 
libtorch-cuda12_4-shared-with-deps-cxx11-abi + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + libtorch-rocm6_0-shared-with-deps-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -234,19 +303,19 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.0-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-rocm5_6-shared-with-deps-cxx11-abi + build_name: libtorch-rocm6_0-shared-with-deps-cxx11-abi build_environment: linux-binary-libtorch-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-rocm5_6-shared-with-deps-cxx11-abi-test: # Testing + libtorch-rocm6_0-shared-with-deps-cxx11-abi-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-rocm5_6-shared-with-deps-cxx11-abi-build + needs: libtorch-rocm6_0-shared-with-deps-cxx11-abi-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -255,11 +324,11 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.0-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: cxx11-abi steps: @@ -268,7 +337,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: libtorch-rocm5_6-shared-with-deps-cxx11-abi + name: libtorch-rocm6_0-shared-with-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -301,36 +370,37 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/libtorch-cxx11-builder:rocm5.6-main + docker-image: pytorch/libtorch-cxx11-builder:rocm6.0-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - libtorch-rocm5_6-shared-with-deps-cxx11-abi-upload: # Uploading + libtorch-rocm6_0-shared-with-deps-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-rocm5_6-shared-with-deps-cxx11-abi-test + permissions: + id-token: write + contents: read + needs: libtorch-rocm6_0-shared-with-deps-cxx11-abi-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.0-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-rocm5_6-shared-with-deps-cxx11-abi + build_name: 
libtorch-rocm6_0-shared-with-deps-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - libtorch-rocm5_7-shared-with-deps-cxx11-abi-build: + libtorch-rocm6_1-shared-with-deps-cxx11-abi-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -339,19 +409,19 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.1-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-rocm5_7-shared-with-deps-cxx11-abi + build_name: libtorch-rocm6_1-shared-with-deps-cxx11-abi build_environment: linux-binary-libtorch-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-rocm5_7-shared-with-deps-cxx11-abi-test: # Testing + libtorch-rocm6_1-shared-with-deps-cxx11-abi-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-rocm5_7-shared-with-deps-cxx11-abi-build + needs: libtorch-rocm6_1-shared-with-deps-cxx11-abi-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -360,11 +430,11 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.1-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: cxx11-abi steps: @@ -373,7 +443,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: libtorch-rocm5_7-shared-with-deps-cxx11-abi + name: libtorch-rocm6_1-shared-with-deps-cxx11-abi path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -406,31 +476,32 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/libtorch-cxx11-builder:rocm5.7-main + docker-image: pytorch/libtorch-cxx11-builder:rocm6.1-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - libtorch-rocm5_7-shared-with-deps-cxx11-abi-upload: # Uploading + libtorch-rocm6_1-shared-with-deps-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-rocm5_7-shared-with-deps-cxx11-abi-test + permissions: + id-token: write + contents: read + needs: libtorch-rocm6_1-shared-with-deps-cxx11-abi-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm5.7-main + 
DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:rocm6.1-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-rocm5_7-shared-with-deps-cxx11-abi + build_name: libtorch-rocm6_1-shared-with-deps-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-main.yml b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-main.yml index 2fec2021b636c..0158860d6f942 100644 --- a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-main.yml +++ b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-main.yml @@ -4,6 +4,7 @@ # Generation script: .github/scripts/generate_ci_workflows.py name: linux-binary-libtorch-pre-cxx11 + on: push: branches: @@ -25,7 +26,7 @@ env: PYTORCH_FINAL_PACKAGE_DIR: /artifacts PYTORCH_ROOT: /pytorch SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: linux-binary-libtorch-pre-cxx11-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true diff --git a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml index e93aa4177b530..3205c3c78dad4 100644 --- a/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml +++ b/.github/workflows/generated-linux-binary-libtorch-pre-cxx11-nightly.yml @@ -4,6 +4,7 @@ # Generation script: .github/scripts/generate_ci_workflows.py name: linux-binary-libtorch-pre-cxx11 + on: push: # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build @@ -30,7 +31,7 @@ env: PYTORCH_FINAL_PACKAGE_DIR: /artifacts PYTORCH_ROOT: /pytorch SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: linux-binary-libtorch-pre-cxx11-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -76,6 +77,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cpu-shared-with-deps-pre-cxx11-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cpu-shared-with-deps-pre-cxx11-test with: PYTORCH_ROOT: /pytorch @@ -91,8 +95,6 @@ jobs: build_name: libtorch-cpu-shared-with-deps-pre-cxx11 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -139,6 +141,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda11_8-shared-with-deps-pre-cxx11-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: 
+ id-token: write + contents: read needs: libtorch-cuda11_8-shared-with-deps-pre-cxx11-test with: PYTORCH_ROOT: /pytorch @@ -155,8 +160,6 @@ jobs: build_name: libtorch-cuda11_8-shared-with-deps-pre-cxx11 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -203,6 +206,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} libtorch-cuda12_1-shared-with-deps-pre-cxx11-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cuda12_1-shared-with-deps-pre-cxx11-test with: PYTORCH_ROOT: /pytorch @@ -219,13 +225,76 @@ jobs: build_name: libtorch-cuda12_1-shared-with-deps-pre-cxx11 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - libtorch-rocm5_6-shared-with-deps-pre-cxx11-build: + libtorch-cuda12_4-shared-with-deps-pre-cxx11-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + build_name: libtorch-cuda12_4-shared-with-deps-pre-cxx11 + build_environment: linux-binary-libtorch-pre-cxx11 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_4-shared-with-deps-pre-cxx11-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda12_4-shared-with-deps-pre-cxx11-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + build_name: libtorch-cuda12_4-shared-with-deps-pre-cxx11 + build_environment: linux-binary-libtorch-pre-cxx11 + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + libtorch-cuda12_4-shared-with-deps-pre-cxx11-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_4-shared-with-deps-pre-cxx11-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + 
LIBTORCH_VARIANT: shared-with-deps + DESIRED_DEVTOOLSET: pre-cxx11 + build_name: libtorch-cuda12_4-shared-with-deps-pre-cxx11 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + libtorch-rocm6_0-shared-with-deps-pre-cxx11-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -234,19 +303,19 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-rocm5_6-shared-with-deps-pre-cxx11 + build_name: libtorch-rocm6_0-shared-with-deps-pre-cxx11 build_environment: linux-binary-libtorch-pre-cxx11 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-rocm5_6-shared-with-deps-pre-cxx11-test: # Testing + libtorch-rocm6_0-shared-with-deps-pre-cxx11-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-rocm5_6-shared-with-deps-pre-cxx11-build + needs: libtorch-rocm6_0-shared-with-deps-pre-cxx11-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -255,11 +324,11 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: pre-cxx11 steps: @@ -268,7 +337,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: libtorch-rocm5_6-shared-with-deps-pre-cxx11 + name: libtorch-rocm6_0-shared-with-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -301,36 +370,37 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.6-main + docker-image: pytorch/manylinux-builder:rocm6.0-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - libtorch-rocm5_6-shared-with-deps-pre-cxx11-upload: # Uploading + libtorch-rocm6_0-shared-with-deps-pre-cxx11-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-rocm5_6-shared-with-deps-pre-cxx11-test + permissions: + id-token: write + contents: read + needs: libtorch-rocm6_0-shared-with-deps-pre-cxx11-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-rocm5_6-shared-with-deps-pre-cxx11 + 
build_name: libtorch-rocm6_0-shared-with-deps-pre-cxx11 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - libtorch-rocm5_7-shared-with-deps-pre-cxx11-build: + libtorch-rocm6_1-shared-with-deps-pre-cxx11-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -339,19 +409,19 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-rocm5_7-shared-with-deps-pre-cxx11 + build_name: libtorch-rocm6_1-shared-with-deps-pre-cxx11 build_environment: linux-binary-libtorch-pre-cxx11 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - libtorch-rocm5_7-shared-with-deps-pre-cxx11-test: # Testing + libtorch-rocm6_1-shared-with-deps-pre-cxx11-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-rocm5_7-shared-with-deps-pre-cxx11-build + needs: libtorch-rocm6_1-shared-with-deps-pre-cxx11-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -360,11 +430,11 @@ jobs: PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: pre-cxx11 steps: @@ -373,7 +443,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: libtorch-rocm5_7-shared-with-deps-pre-cxx11 + name: libtorch-rocm6_1-shared-with-deps-pre-cxx11 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -406,31 +476,32 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.7-main + docker-image: pytorch/manylinux-builder:rocm6.1-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - libtorch-rocm5_7-shared-with-deps-pre-cxx11-upload: # Uploading + libtorch-rocm6_1-shared-with-deps-pre-cxx11-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-rocm5_7-shared-with-deps-pre-cxx11-test + permissions: + id-token: write + contents: read + needs: libtorch-rocm6_1-shared-with-deps-pre-cxx11-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: libtorch # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: 
pytorch/manylinux-builder:rocm6.1-main LIBTORCH_VARIANT: shared-with-deps DESIRED_DEVTOOLSET: pre-cxx11 - build_name: libtorch-rocm5_7-shared-with-deps-pre-cxx11 + build_name: libtorch-rocm6_1-shared-with-deps-pre-cxx11 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-linux-binary-manywheel-main.yml b/.github/workflows/generated-linux-binary-manywheel-main.yml index e10d4ef7f725f..4764ede6bcb2c 100644 --- a/.github/workflows/generated-linux-binary-manywheel-main.yml +++ b/.github/workflows/generated-linux-binary-manywheel-main.yml @@ -4,6 +4,7 @@ # Generation script: .github/scripts/generate_ci_workflows.py name: linux-binary-manywheel + on: push: branches: @@ -25,7 +26,7 @@ env: PYTORCH_FINAL_PACKAGE_DIR: /artifacts PYTORCH_ROOT: /pytorch SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: linux-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -47,7 +48,7 @@ jobs: DESIRED_PYTHON: "3.8" build_name: manywheel-py3_8-cuda11_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cuda11_8-test: # Testing @@ -87,7 +88,7 @@ jobs: DESIRED_PYTHON: "3.8" build_name: manywheel-py3_8-cuda12_1 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cuda12_1-test: # Testing diff --git a/.github/workflows/generated-linux-binary-manywheel-nightly.yml b/.github/workflows/generated-linux-binary-manywheel-nightly.yml index 783d54b8157c8..8ad43b4c36607 100644 --- a/.github/workflows/generated-linux-binary-manywheel-nightly.yml +++ b/.github/workflows/generated-linux-binary-manywheel-nightly.yml @@ -4,6 +4,7 @@ # Generation script: .github/scripts/generate_ci_workflows.py name: linux-binary-manywheel + on: push: # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build @@ -30,7 +31,7 @@ env: PYTORCH_FINAL_PACKAGE_DIR: /artifacts PYTORCH_ROOT: /pytorch SHA1: ${{ github.event.pull_request.head.sha || github.sha }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: linux-binary-manywheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} 
cancel-in-progress: true @@ -74,6 +75,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_8-cpu-test with: PYTORCH_ROOT: /pytorch @@ -88,8 +92,6 @@ jobs: build_name: manywheel-py3_8-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -134,6 +136,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cpu-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_8-cpu-cxx11-abi-test with: PYTORCH_ROOT: /pytorch @@ -149,8 +154,6 @@ jobs: build_name: manywheel-py3_8-cpu-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -171,7 +174,7 @@ jobs: DESIRED_PYTHON: "3.8" build_name: manywheel-py3_8-cuda11_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system 
== 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cuda11_8-test: # Testing @@ -196,6 +199,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_8-cuda11_8-test with: PYTORCH_ROOT: /pytorch @@ -211,8 +217,6 @@ jobs: build_name: manywheel-py3_8-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -233,7 +237,7 @@ jobs: DESIRED_PYTHON: "3.8" build_name: manywheel-py3_8-cuda12_1 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cuda12_1-test: # Testing @@ -258,6 +262,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_8-cuda12_1-upload: # Uploading if: ${{ 
github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_8-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ -273,13 +280,11 @@ jobs: build_name: manywheel-py3_8-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_8-rocm5_6-build: + manywheel-py3_8-cuda12_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -288,18 +293,81 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.8" + build_name: manywheel-py3_8-cuda12_4 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_8-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_8-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.8" + build_name: manywheel-py3_8-cuda12_4 + build_environment: linux-binary-manywheel + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_8-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_8-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + 
GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.8" + build_name: manywheel-py3_8-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_8-rocm6_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.8" - build_name: manywheel-py3_8-rocm5_6 + build_name: manywheel-py3_8-rocm6_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_8-rocm5_6-test: # Testing + manywheel-py3_8-rocm6_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-rocm5_6-build + needs: manywheel-py3_8-rocm6_0-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -308,11 +376,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.8" steps: - name: Setup ROCm @@ -320,7 +388,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_8-rocm5_6 + name: manywheel-py3_8-rocm6_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -353,35 +421,36 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.6-main + docker-image: pytorch/manylinux-builder:rocm6.0-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_8-rocm5_6-upload: # Uploading + manywheel-py3_8-rocm6_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-rocm5_6-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_8-rocm6_0-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.8" - build_name: manywheel-py3_8-rocm5_6 + build_name: manywheel-py3_8-rocm6_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ 
secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_8-rocm5_7-build: + manywheel-py3_8-rocm6_1-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -390,18 +459,18 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.8" - build_name: manywheel-py3_8-rocm5_7 + build_name: manywheel-py3_8-rocm6_1 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_8-rocm5_7-test: # Testing + manywheel-py3_8-rocm6_1-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-rocm5_7-build + needs: manywheel-py3_8-rocm6_1-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -410,11 +479,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.8" steps: - name: Setup ROCm @@ -422,7 +491,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_8-rocm5_7 + name: manywheel-py3_8-rocm6_1 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -455,30 +524,31 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.7-main + docker-image: pytorch/manylinux-builder:rocm6.1-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_8-rocm5_7-upload: # Uploading + manywheel-py3_8-rocm6_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_8-rocm5_7-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_8-rocm6_1-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.8" - build_name: manywheel-py3_8-rocm5_7 + build_name: manywheel-py3_8-rocm6_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -521,6 +591,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + 
contents: read needs: manywheel-py3_9-cpu-test with: PYTORCH_ROOT: /pytorch @@ -535,8 +608,6 @@ jobs: build_name: manywheel-py3_9-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -581,6 +652,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cpu-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_9-cpu-cxx11-abi-test with: PYTORCH_ROOT: /pytorch @@ -596,8 +670,6 @@ jobs: build_name: manywheel-py3_9-cpu-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -618,7 +690,7 @@ jobs: DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cuda11_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 
'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda11_8-test: # Testing @@ -643,6 +715,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_9-cuda11_8-test with: PYTORCH_ROOT: /pytorch @@ -658,8 +733,6 @@ jobs: build_name: manywheel-py3_9-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -680,7 +753,7 @@ jobs: DESIRED_PYTHON: "3.9" build_name: manywheel-py3_9-cuda12_1 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda12_1-test: # Testing @@ -705,6 +778,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_9-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_9-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ -720,13 +796,74 @@ jobs: build_name: manywheel-py3_9-cuda12_1 secrets: 
github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_9-rocm5_6-build: + manywheel-py3_9-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_4 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_9-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_4 + build_environment: linux-binary-manywheel + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_9-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.9" + build_name: manywheel-py3_9-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ 
secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_9-rocm6_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -735,18 +872,18 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-rocm5_6 + build_name: manywheel-py3_9-rocm6_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-rocm5_6-test: # Testing + manywheel-py3_9-rocm6_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-rocm5_6-build + needs: manywheel-py3_9-rocm6_0-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -755,11 +892,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.9" steps: - name: Setup ROCm @@ -767,7 +904,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_9-rocm5_6 + name: manywheel-py3_9-rocm6_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -800,35 +937,36 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.6-main + docker-image: pytorch/manylinux-builder:rocm6.0-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_9-rocm5_6-upload: # Uploading + manywheel-py3_9-rocm6_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-rocm5_6-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-rocm6_0-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-rocm5_6 + build_name: manywheel-py3_9-rocm6_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_9-rocm5_7-build: + manywheel-py3_9-rocm6_1-build: if: ${{ github.repository_owner == 'pytorch' }} uses: 
./.github/workflows/_binary-build-linux.yml with: @@ -837,18 +975,18 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-rocm5_7 + build_name: manywheel-py3_9-rocm6_1 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_9-rocm5_7-test: # Testing + manywheel-py3_9-rocm6_1-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-rocm5_7-build + needs: manywheel-py3_9-rocm6_1-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -857,11 +995,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.9" steps: - name: Setup ROCm @@ -869,7 +1007,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_9-rocm5_7 + name: manywheel-py3_9-rocm6_1 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -902,30 +1040,31 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.7-main + docker-image: pytorch/manylinux-builder:rocm6.1-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_9-rocm5_7-upload: # Uploading + manywheel-py3_9-rocm6_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_9-rocm5_7-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_9-rocm6_1-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.9" - build_name: manywheel-py3_9-rocm5_7 + build_name: manywheel-py3_9-rocm6_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -968,6 +1107,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_10-cpu-test with: PYTORCH_ROOT: /pytorch @@ -982,8 +1124,6 @@ jobs: build_name: manywheel-py3_10-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - 
aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1028,6 +1168,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cpu-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_10-cpu-cxx11-abi-test with: PYTORCH_ROOT: /pytorch @@ -1043,8 +1186,6 @@ jobs: build_name: manywheel-py3_10-cpu-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1065,7 +1206,7 @@ jobs: DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda11_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda11_8-test: # Testing @@ -1090,6 +1231,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} 
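The PYTORCH_EXTRA_INSTALL_REQUIREMENTS values in these jobs pack several PEP 508 requirement strings into one setting, joined with " | "; the environment marker on each entry (platform_system == 'Linux' and platform_machine == 'x86_64') restricts the pinned NVIDIA wheels to Linux x86_64 installs. As a rough sketch of how such markers behave, assuming only the standard packaging library (this is not the builder's actual parsing code, and the variable names below are illustrative; the two pins are copied from the cu12.4 lists in this workflow):

    from packaging.requirements import Requirement

    # Two entries in the same " | "-joined format used by PYTORCH_EXTRA_INSTALL_REQUIREMENTS.
    extra = (
        "nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
        "nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64'"
    )

    for spec in extra.split(" | "):
        req = Requirement(spec)  # parses "name==version; marker"
        # Marker.evaluate() checks the marker against the running interpreter/platform.
        applies = req.marker is None or req.marker.evaluate()
        print(req.name, req.specifier, "install" if applies else "skipped on this platform")

pip applies the same marker evaluation at install time, so these extra pins are simply skipped on platforms other than Linux x86_64.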
manywheel-py3_10-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_10-cuda11_8-test with: PYTORCH_ROOT: /pytorch @@ -1105,8 +1249,6 @@ jobs: build_name: manywheel-py3_10-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1127,7 +1269,7 @@ jobs: DESIRED_PYTHON: "3.10" build_name: manywheel-py3_10-cuda12_1 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_1-test: # Testing @@ -1152,6 +1294,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_10-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_10-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ -1167,13 +1312,11 @@ jobs: build_name: manywheel-py3_10-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - 
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-rocm5_6-build: + manywheel-py3_10-cuda12_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -1182,18 +1325,81 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda12_4 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_10-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda12_4 + build_environment: linux-binary-manywheel + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_10-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.10" + build_name: manywheel-py3_10-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ 
secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_10-rocm6_0-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-rocm5_6 + build_name: manywheel-py3_10-rocm6_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-rocm5_6-test: # Testing + manywheel-py3_10-rocm6_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-rocm5_6-build + needs: manywheel-py3_10-rocm6_0-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -1202,11 +1408,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.10" steps: - name: Setup ROCm @@ -1214,7 +1420,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_10-rocm5_6 + name: manywheel-py3_10-rocm6_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -1247,35 +1453,36 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.6-main + docker-image: pytorch/manylinux-builder:rocm6.0-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_10-rocm5_6-upload: # Uploading + manywheel-py3_10-rocm6_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-rocm5_6-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-rocm6_0-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-rocm5_6 + build_name: manywheel-py3_10-rocm6_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_10-rocm5_7-build: + manywheel-py3_10-rocm6_1-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -1284,18 +1491,18 @@ jobs: PACKAGE_TYPE: 
manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-rocm5_7 + build_name: manywheel-py3_10-rocm6_1 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_10-rocm5_7-test: # Testing + manywheel-py3_10-rocm6_1-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-rocm5_7-build + needs: manywheel-py3_10-rocm6_1-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -1304,11 +1511,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.10" steps: - name: Setup ROCm @@ -1316,7 +1523,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_10-rocm5_7 + name: manywheel-py3_10-rocm6_1 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -1349,30 +1556,31 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.7-main + docker-image: pytorch/manylinux-builder:rocm6.1-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_10-rocm5_7-upload: # Uploading + manywheel-py3_10-rocm6_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_10-rocm5_7-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_10-rocm6_1-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.10" - build_name: manywheel-py3_10-rocm5_7 + build_name: manywheel-py3_10-rocm6_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1415,6 +1623,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_11-cpu-test with: PYTORCH_ROOT: /pytorch @@ -1429,8 +1640,6 @@ jobs: build_name: manywheel-py3_11-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - 
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1475,6 +1684,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cpu-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_11-cpu-cxx11-abi-test with: PYTORCH_ROOT: /pytorch @@ -1490,8 +1702,6 @@ jobs: build_name: manywheel-py3_11-cpu-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1512,7 +1722,7 @@ jobs: DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda11_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda11_8-test: # Testing @@ -1537,6 +1747,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} 
+ permissions: + id-token: write + contents: read needs: manywheel-py3_11-cuda11_8-test with: PYTORCH_ROOT: /pytorch @@ -1552,8 +1765,6 @@ jobs: build_name: manywheel-py3_11-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1574,7 +1785,7 @@ jobs: DESIRED_PYTHON: "3.11" build_name: manywheel-py3_11-cuda12_1 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_1-test: # Testing @@ -1599,6 +1810,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_11-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_11-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ -1614,13 +1828,11 @@ jobs: build_name: manywheel-py3_11-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} 
conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_11-rocm5_6-build: + manywheel-py3_11-cuda12_4-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -1629,18 +1841,81 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_4 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_11-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_4 + build_environment: linux-binary-manywheel + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_11-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.11" + build_name: manywheel-py3_11-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_11-rocm6_0-build: + if: 
${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-rocm5_6 + build_name: manywheel-py3_11-rocm6_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-rocm5_6-test: # Testing + manywheel-py3_11-rocm6_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_11-rocm5_6-build + needs: manywheel-py3_11-rocm6_0-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -1649,11 +1924,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.11" steps: - name: Setup ROCm @@ -1661,7 +1936,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_11-rocm5_6 + name: manywheel-py3_11-rocm6_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -1694,35 +1969,36 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.6-main + docker-image: pytorch/manylinux-builder:rocm6.0-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_11-rocm5_6-upload: # Uploading + manywheel-py3_11-rocm6_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_11-rocm5_6-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-rocm6_0-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-rocm5_6 + build_name: manywheel-py3_11-rocm6_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_11-rocm5_7-build: + manywheel-py3_11-rocm6_1-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -1731,18 +2007,18 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: 
rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-rocm5_7 + build_name: manywheel-py3_11-rocm6_1 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_11-rocm5_7-test: # Testing + manywheel-py3_11-rocm6_1-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_11-rocm5_7-build + needs: manywheel-py3_11-rocm6_1-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -1751,11 +2027,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.11" steps: - name: Setup ROCm @@ -1763,7 +2039,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_11-rocm5_7 + name: manywheel-py3_11-rocm6_1 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -1796,30 +2072,31 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.7-main + docker-image: pytorch/manylinux-builder:rocm6.1-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_11-rocm5_7-upload: # Uploading + manywheel-py3_11-rocm6_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_11-rocm5_7-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_11-rocm6_1-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.11" - build_name: manywheel-py3_11-rocm5_7 + build_name: manywheel-py3_11-rocm6_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1862,6 +2139,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_12-cpu-test with: PYTORCH_ROOT: /pytorch @@ -1876,8 +2156,6 @@ jobs: build_name: manywheel-py3_12-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ 
secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1922,6 +2200,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cpu-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_12-cpu-cxx11-abi-test with: PYTORCH_ROOT: /pytorch @@ -1937,8 +2218,6 @@ jobs: build_name: manywheel-py3_12-cpu-cxx11-abi secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1959,7 +2238,7 @@ jobs: DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda11_8 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu11==11.8.89; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu11==11.8.87; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu11==8.7.0.84; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu11==11.11.3.6; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu11==10.9.0.58; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu11==10.3.0.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu11==11.4.1.48; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu11==11.7.5.86; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu11==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu11==11.8.86; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda11_8-test: # Testing @@ -1984,6 +2263,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_12-cuda11_8-test with: PYTORCH_ROOT: /pytorch @@ 
-1999,8 +2281,6 @@ jobs: build_name: manywheel-py3_12-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -2021,7 +2301,7 @@ jobs: DESIRED_PYTHON: "3.12" build_name: manywheel-py3_12-cuda12_1 build_environment: linux-binary-manywheel - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' secrets: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_1-test: # Testing @@ -2046,6 +2326,9 @@ jobs: github-token: ${{ secrets.GITHUB_TOKEN }} manywheel-py3_12-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: manywheel-py3_12-cuda12_1-test with: PYTORCH_ROOT: /pytorch @@ -2061,13 +2344,74 @@ jobs: build_name: manywheel-py3_12-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: 
./.github/workflows/_binary-upload.yml - manywheel-py3_12-rocm5_6-build: + manywheel-py3_12-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + uses: ./.github/workflows/_binary-build-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda12_4 + build_environment: linux-binary-manywheel + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.7.29; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.4.2.65; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.2.0.44; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.5.119; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.6.0.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.3.0.142; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvjitlink-cu12==12.4.99; platform_system == 'Linux' and platform_machine == 'x86_64' + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: manywheel-py3_12-cuda12_4-build + uses: ./.github/workflows/_binary-test-linux.yml + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda12_4 + build_environment: linux-binary-manywheel + runs_on: linux.4xlarge.nvidia.gpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + manywheel-py3_12-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-cuda12_4-test + with: + PYTORCH_ROOT: /pytorch + BUILDER_ROOT: /builder + PACKAGE_TYPE: manywheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DOCKER_IMAGE: pytorch/manylinux-builder:cuda12.4-main + DESIRED_PYTHON: "3.12" + build_name: manywheel-py3_12-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + + manywheel-py3_12-rocm6_0-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -2076,18 +2420,18 @@ jobs: PACKAGE_TYPE: manywheel # 
TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-rocm5_6 + build_name: manywheel-py3_12-rocm6_0 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-rocm5_6-test: # Testing + manywheel-py3_12-rocm6_0-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_12-rocm5_6-build + needs: manywheel-py3_12-rocm6_0-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -2096,11 +2440,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.12" steps: - name: Setup ROCm @@ -2108,7 +2452,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_12-rocm5_6 + name: manywheel-py3_12-rocm6_0 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2141,35 +2485,36 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.6-main + docker-image: pytorch/manylinux-builder:rocm6.0-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_12-rocm5_6-upload: # Uploading + manywheel-py3_12-rocm6_0-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_12-rocm5_6-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-rocm6_0-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.6 - GPU_ARCH_VERSION: 5.6 + DESIRED_CUDA: rocm6.0 + GPU_ARCH_VERSION: 6.0 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.6-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.0-main DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-rocm5_6 + build_name: manywheel-py3_12-rocm6_0 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - manywheel-py3_12-rocm5_7-build: + manywheel-py3_12-rocm6_1-build: if: ${{ github.repository_owner == 'pytorch' }} uses: ./.github/workflows/_binary-build-linux.yml with: @@ -2178,18 +2523,18 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: 
pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-rocm5_7 + build_name: manywheel-py3_12-rocm6_1 build_environment: linux-binary-manywheel secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - manywheel-py3_12-rocm5_7-test: # Testing + manywheel-py3_12-rocm6_1-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_12-rocm5_7-build + needs: manywheel-py3_12-rocm6_1-build runs-on: linux.rocm.gpu timeout-minutes: 240 env: @@ -2198,11 +2543,11 @@ jobs: PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm SKIP_ALL_TESTS: 1 - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.12" steps: - name: Setup ROCm @@ -2210,7 +2555,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: manywheel-py3_12-rocm5_7 + name: manywheel-py3_12-rocm6_1 path: "${{ runner.temp }}/artifacts/" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2243,30 +2588,31 @@ jobs: - name: Pull Docker image uses: pytorch/test-infra/.github/actions/pull-docker-image@main with: - docker-image: pytorch/manylinux-builder:rocm5.7-main + docker-image: pytorch/manylinux-builder:rocm6.1-main - name: Test Pytorch binary uses: ./pytorch/.github/actions/test-pytorch-binary - name: Teardown ROCm uses: ./.github/actions/teardown-rocm - manywheel-py3_12-rocm5_7-upload: # Uploading + manywheel-py3_12-rocm6_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: manywheel-py3_12-rocm5_7-test + permissions: + id-token: write + contents: read + needs: manywheel-py3_12-rocm6_1-test with: PYTORCH_ROOT: /pytorch BUILDER_ROOT: /builder PACKAGE_TYPE: manywheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: rocm5.7 - GPU_ARCH_VERSION: 5.7 + DESIRED_CUDA: rocm6.1 + GPU_ARCH_VERSION: 6.1 GPU_ARCH_TYPE: rocm - DOCKER_IMAGE: pytorch/manylinux-builder:rocm5.7-main + DOCKER_IMAGE: pytorch/manylinux-builder:rocm6.1-main DESIRED_PYTHON: "3.12" - build_name: manywheel-py3_12-rocm5_7 + build_name: manywheel-py3_12-rocm6_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml index 40f0ece4ff4a2..a8cbdb7cd6feb 100644 --- a/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-conda-nightly.yml @@ -26,9 +26,7 @@ env: BUILD_ENVIRONMENT: macos-arm64-binary-conda GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 - CROSS_COMPILE_ARM64: 1 - + SKIP_ALL_TESTS: 0 concurrency: group: macos-arm64-binary-conda-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ 
github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -36,7 +34,7 @@ concurrency: jobs: conda-py3_8-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl + runs-on: macos-13-xlarge timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -131,6 +129,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" conda-py3_8-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_8-cpu-build with: PYTORCH_ROOT: /pytorch @@ -146,14 +147,12 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml conda-py3_9-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl + runs-on: macos-13-xlarge timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -248,6 +247,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" conda-py3_9-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_9-cpu-build with: PYTORCH_ROOT: /pytorch @@ -263,14 +265,12 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml conda-py3_10-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl + runs-on: macos-13-xlarge timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -365,6 +365,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" conda-py3_10-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_10-cpu-build with: PYTORCH_ROOT: /pytorch @@ -380,14 +383,12 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml conda-py3_11-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl + runs-on: macos-13-xlarge timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -482,6 +483,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" conda-py3_11-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_11-cpu-build with: PYTORCH_ROOT: /pytorch @@ -497,14 +501,12 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ 
secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml conda-py3_12-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl + runs-on: macos-13-xlarge timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -599,6 +601,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" conda-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_12-cpu-build with: PYTORCH_ROOT: /pytorch @@ -614,8 +619,6 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-macos-arm64-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-macos-arm64-binary-libtorch-cxx11-abi-nightly.yml index 7a7e7d563ae80..0ed7ba10a07d5 100644 --- a/.github/workflows/generated-macos-arm64-binary-libtorch-cxx11-abi-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-libtorch-cxx11-abi-nightly.yml @@ -26,7 +26,7 @@ env: BUILD_ENVIRONMENT: macos-arm64-binary-libtorch-cxx11-abi GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: macos-arm64-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -133,6 +133,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" libtorch-cpu-shared-with-deps-cxx11-abi-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cpu-shared-with-deps-cxx11-abi-build with: PYTORCH_ROOT: /pytorch @@ -149,8 +152,6 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml index a0114ebe2f75b..167161de3645c 100644 --- a/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml +++ b/.github/workflows/generated-macos-arm64-binary-wheel-nightly.yml @@ -26,7 +26,7 @@ env: BUILD_ENVIRONMENT: macos-arm64-binary-wheel GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 + SKIP_ALL_TESTS: 0 concurrency: group: macos-arm64-binary-wheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true @@ -46,7 +46,7 @@ jobs: GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 
1 DESIRED_PYTHON: "3.8" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} @@ -130,6 +130,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" wheel-py3_8-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_8-cpu-build with: PYTORCH_ROOT: /pytorch @@ -145,8 +148,6 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -164,7 +165,7 @@ jobs: GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 
'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} @@ -248,6 +249,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" wheel-py3_9-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_9-cpu-build with: PYTORCH_ROOT: /pytorch @@ -263,8 +267,6 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -282,7 +284,7 @@ jobs: GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 
'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} @@ -366,6 +368,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" wheel-py3_10-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_10-cpu-build with: PYTORCH_ROOT: /pytorch @@ -381,8 +386,6 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -400,7 +403,7 @@ jobs: GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} @@ -484,6 +487,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" wheel-py3_11-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_11-cpu-build with: PYTORCH_ROOT: /pytorch @@ -499,8 +505,6 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -518,7 +522,7 @@ jobs: GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' # For sccache access (only on non-forked PRs) AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} @@ -602,6 +606,9 @@ jobs: path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" wheel-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_12-cpu-build with: PYTORCH_ROOT: /pytorch @@ -617,8 +624,6 @@ jobs: use_s3: False secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-macos-binary-conda-nightly.yml b/.github/workflows/generated-macos-binary-conda-nightly.yml deleted file mode 100644 index 984b54cd19a43..0000000000000 --- a/.github/workflows/generated-macos-binary-conda-nightly.yml +++ /dev/null @@ -1,619 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-binary-conda - -on: -# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_conda/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: macos-binary-conda - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 -concurrency: - group: macos-binary-conda-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - conda-py3_8-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are 
put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: conda-py3_8-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_8-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_8-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu-main - DESIRED_PYTHON: "3.8" - build_name: conda-py3_8-cpu - use_s3: 
False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - conda-py3_9-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output 
/usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: conda-py3_9-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_9-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_9-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu-main - DESIRED_PYTHON: "3.9" - build_name: conda-py3_9-cpu - use_s3: False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - conda-py3_10-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: conda-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_10-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu-main - DESIRED_PYTHON: "3.10" - build_name: conda-py3_10-cpu - use_s3: False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - 
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - conda-py3_11-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - 
run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: conda-py3_11-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_11-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_11-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu-main - DESIRED_PYTHON: "3.11" - build_name: conda-py3_11-cpu - use_s3: False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - conda-py3_12-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: conda-py3_12-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - conda-py3_12-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_12-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: conda - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/conda-builder:cpu-main - DESIRED_PYTHON: "3.12" - build_name: conda-py3_12-cpu - use_s3: False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - 
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml b/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml deleted file mode 100644 index 1e0a7bfbe84dd..0000000000000 --- a/.github/workflows/generated-macos-binary-libtorch-cxx11-abi-nightly.yml +++ /dev/null @@ -1,156 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-binary-libtorch-cxx11-abi - -on: -# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_libtorch/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: macos-binary-libtorch-cxx11-abi - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 -concurrency: - group: macos-binary-libtorch-cxx11-abi-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - libtorch-cpu-shared-with-deps-cxx11-abi-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - # This is a dummy value for libtorch to work correctly with our batch scripts - # without this value pip does not get installed for some reason - DESIRED_PYTHON: "3.8" - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: libtorch-cpu-shared-with-deps-cxx11-abi - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - libtorch-cpu-shared-with-deps-cxx11-abi-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: libtorch-cpu-shared-with-deps-cxx11-abi-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: libtorch - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/libtorch-cxx11-builder:cpu-main - LIBTORCH_VARIANT: shared-with-deps - DESIRED_DEVTOOLSET: cxx11-abi - build_name: libtorch-cpu-shared-with-deps-cxx11-abi - use_s3: False - secrets: - github-token: ${{ 
secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-macos-binary-wheel-nightly.yml b/.github/workflows/generated-macos-binary-wheel-nightly.yml deleted file mode 100644 index fc5bc266e2c51..0000000000000 --- a/.github/workflows/generated-macos-binary-wheel-nightly.yml +++ /dev/null @@ -1,624 +0,0 @@ -# @generated DO NOT EDIT MANUALLY - -# Template is at: .github/templates/macos_binary_build_workflow.yml.j2 -# Generation script: .github/scripts/generate_ci_workflows.py -name: macos-binary-wheel - -on: -# TODO: Migrate to new ciflow trigger, reference https://github.com/pytorch/pytorch/pull/70321 - push: - # NOTE: Meta Employees can trigger new nightlies using: https://fburl.com/trigger_pytorch_nightly_build - branches: - - nightly - tags: - # NOTE: Binary build pipelines should only get triggered on release candidate builds - # Release candidate tags look like: v1.11.0-rc1 - - v[0-9]+.[0-9]+.[0-9]+-rc[0-9]+ - - 'ciflow/binaries/*' - - 'ciflow/binaries_wheel/*' - workflow_dispatch: - -env: - # Needed for conda builds - ALPINE_IMAGE: "308535385114.dkr.ecr.us-east-1.amazonaws.com/tool/alpine" - ANACONDA_USER: pytorch - AWS_DEFAULT_REGION: us-east-1 - BUILD_ENVIRONMENT: macos-binary-wheel - GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} - PR_NUMBER: ${{ github.event.pull_request.number }} - SKIP_ALL_TESTS: 1 -concurrency: - group: macos-binary-wheel-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true - -jobs: - wheel-py3_8-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.8" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} 
- AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. - - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: wheel-py3_8-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_8-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_8-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - 
GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - DESIRED_PYTHON: "3.8" - build_name: wheel-py3_8-cpu - use_s3: False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - wheel-py3_9-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.9" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
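The note above is the reason each job writes its own variables through GITHUB_ENV instead of a workflow-level env block: runner.temp only resolves at job run time. As an illustration only (not part of the generated workflow), a minimal bash sketch of that mechanism, using a hypothetical MY_OUTPUT_DIR variable:

# Illustration only: values appended to the file named by GITHUB_ENV become
# environment variables in *subsequent* steps of the same job.
# Step 1: write the variable (RUNNER_TEMP is only known once the job runs).
echo "MY_OUTPUT_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}"
# Step 2, in a later step of the same job: the variable is now in the environment.
echo "artifacts will be written to ${MY_OUTPUT_DIR}"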
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: wheel-py3_9-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_9-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_9-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - DESIRED_PYTHON: "3.9" - build_name: wheel-py3_9-cpu - use_s3: False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - 
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
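The PYTORCH_EXTRA_INSTALL_REQUIREMENTS value above is a pipe-separated list of PEP 508 requirement strings; the platform_system/platform_machine markers make pip skip the CUDA wheels everywhere except Linux x86_64. A single-entry illustration (quoting keeps the marker intact when passed through the shell):

# Illustration only: one entry from the list above installed directly with pip.
# On macOS, or on non-x86_64 Linux, the marker evaluates to false and pip skips it.
pip install "nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64'"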
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: wheel-py3_10-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_10-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cpu - use_s3: False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - 
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
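The sccache step further down installs the binary into /usr/local/bin and points SCCACHE_BUCKET at the shared compiler cache. A quick sanity check of that setup (illustration only, not a step in the workflow):

# Illustration only: confirm sccache is on PATH and is actually serving requests.
sccache --version       # the binary fetched by the install step below
sccache --show-stats    # hit/miss counters; requests should be non-zero after a build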
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: wheel-py3_11-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_11-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_11-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cpu - use_s3: False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - 
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cpu-build: - if: ${{ github.repository_owner == 'pytorch' }} - runs-on: macos-12-xl - timeout-minutes: 240 - env: - PYTORCH_ROOT: ${{ github.workspace }}/pytorch - BUILDER_ROOT: ${{ github.workspace }}/builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.12" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' - # For sccache access (only on non-forked PRs) - AWS_ACCESS_KEY_ID: ${{ secrets.MACOS_SCCACHE_S3_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.MACOS_SCCACHE_S3_SECRET_ACCESS_KEY }} - steps: - # NOTE: These environment variables are put here so that they can be applied on every job equally - # They are also here because setting them at a workflow level doesn't give us access to the - # runner.temp variable, which we need. 
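The "Install conda and dependencies" step below exports DEVELOPER_DIR to pin the build to Xcode 14.3.1 when that app bundle exists, falling back to 13.3.1. A small check of which toolchain that selection actually resolves to (illustration only):

# Illustration only: verify the Xcode picked up via DEVELOPER_DIR.
echo "DEVELOPER_DIR=${DEVELOPER_DIR:-<unset>}"
xcode-select -p       # active developer directory
xcodebuild -version   # Xcode and build version the compilers will come from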
- - name: Populate binary env - shell: bash - run: | - # shellcheck disable=SC2129 - echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" - # shellcheck disable=SC2129 - echo "MAC_PACKAGE_WORK_DIR=${RUNNER_TEMP}" >> "${GITHUB_ENV}" - - name: Install conda and dependencies - run: | - # Install conda, setup-miniconda messes with the path that messes with the ruby stuff we do later on - curl --retry 3 --retry-all-errors -o "${RUNNER_TEMP}/conda.sh" "https://repo.anaconda.com/miniconda/Miniconda3-py310_23.5.2-0-MacOSX-$(uname -m).sh" - chmod +x "${RUNNER_TEMP}/conda.sh" - /bin/bash "${RUNNER_TEMP}/conda.sh" -b -p "${RUNNER_TEMP}/anaconda" - echo "${RUNNER_TEMP}/anaconda/bin" >> "${GITHUB_PATH}" - if [ -d "/Applications/Xcode_14.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_14.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - elif [ -d "/Applications/Xcode_13.3.1.app" ]; then - echo "DEVELOPER_DIR=/Applications/Xcode_13.3.1.app/Contents/Developer" >> "${GITHUB_ENV}" - fi - - name: Checkout PyTorch - uses: malfet/checkout@silent-checkout - with: - ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} - submodules: recursive - path: pytorch - quiet-checkout: true - - name: Clean PyTorch checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: pytorch - - name: Checkout pytorch/builder - uses: malfet/checkout@silent-checkout - with: - ref: main - submodules: recursive - repository: pytorch/builder - path: builder - quiet-checkout: true - - name: Clean pytorch/builder checkout - run: | - # Remove any artifacts from the previous checkouts - git clean -fxd - working-directory: builder - - name: Install sccache (only for non-forked PRs, and pushes to trunk) - uses: nick-fields/retry@v2.8.2 - if: ${{ github.event_name == 'push' || github.event.pull_request.head.repo.full_name == github.repository }} - with: - timeout_minutes: 5 - max_attempts: 3 - retry_wait_seconds: 90 - command: | - sudo curl --retry 3 --retry-all-errors https://s3.amazonaws.com/ossci-macos/sccache_v2.15 --output /usr/local/bin/sccache - sudo chmod +x /usr/local/bin/sccache - echo "SCCACHE_BUCKET=ossci-compiler-cache-circleci-v2" >> "${GITHUB_ENV}" - - name: Populate binary env - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" - - name: Build PyTorch binary - run: | - # shellcheck disable=SC1091 - source "${RUNNER_TEMP}/anaconda/bin/activate" - "${PYTORCH_ROOT}/.circleci/scripts/binary_macos_build.sh" - - uses: actions/upload-artifact@v3 - if: always() - with: - name: wheel-py3_12-cpu - retention-days: 14 - if-no-files-found: error - path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - wheel-py3_12-cpu-upload: # Uploading - if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_12-cpu-build - with: - PYTORCH_ROOT: /pytorch - BUILDER_ROOT: /builder - PACKAGE_TYPE: wheel - # TODO: This is a legacy variable that we eventually want to get rid of in - # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu - DOCKER_IMAGE: pytorch/manylinux-builder:cpu-main - DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cpu - use_s3: False - secrets: - github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - 
aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} - conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} - conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} - uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-conda-nightly.yml b/.github/workflows/generated-windows-binary-conda-nightly.yml index d87d28c270733..c3e4a038896e7 100644 --- a/.github/workflows/generated-windows-binary-conda-nightly.yml +++ b/.github/workflows/generated-windows-binary-conda-nightly.yml @@ -255,6 +255,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 conda-py3_8-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_8-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -268,8 +271,6 @@ jobs: build_name: conda-py3_8-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -498,6 +499,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 conda-py3_8-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_8-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -512,8 +516,6 @@ jobs: build_name: conda-py3_8-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -742,6 +744,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 conda-py3_8-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_8-cuda12_1-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -756,8 +761,251 @@ jobs: build_name: conda-py3_8-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + conda-py3_8-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see 
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: conda-py3_8-cuda12_4 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_8-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} 
+ needs: conda-py3_8-cuda12_4-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
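The get_ec2_metadata helper above queries the instance metadata endpoint without a session token (IMDSv1). If the runner AMI were ever switched to require IMDSv2, which is an assumption and not something this workflow does, the token-based variant would look roughly like this:

# Assumption/illustration only: IMDSv2 form of the metadata lookup above.
function get_ec2_metadata_v2() {
  local category=$1
  local token
  token=$(curl -fsSL -X PUT "http://169.254.169.254/latest/api/token" \
    -H "X-aws-ec2-metadata-token-ttl-seconds: 60")
  curl -fsSL -H "X-aws-ec2-metadata-token: ${token}" \
    "http://169.254.169.254/latest/meta-data/${category}"
}
get_ec2_metadata_v2 instance-type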
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: conda-py3_8-cuda12_4 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_8-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_8-cuda12_4-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.8" + build_name: conda-py3_8-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -984,6 +1232,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 conda-py3_9-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_9-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -997,8 +1248,6 @@ jobs: build_name: conda-py3_9-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1227,6 +1476,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 conda-py3_9-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_9-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace 
}}/pytorch @@ -1241,8 +1493,6 @@ jobs: build_name: conda-py3_9-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1471,6 +1721,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 conda-py3_9-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: conda-py3_9-cuda12_1-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1485,12 +1738,987 @@ jobs: build_name: conda-py3_9-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_10-cpu-build: + conda-py3_9-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: conda-py3_9-cuda12_4 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_9-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_9-cuda12_4-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" 
+ echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: conda-py3_9-cuda12_4 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_9-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_9-cuda12_4-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + 
GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.9" + build_name: conda-py3_9-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + conda-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
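The upload jobs in this file now carry permissions with id-token: write and no longer pass the aws-pytorch-uploader-* secrets, so the reusable _binary-upload.yml is presumably meant to obtain short-lived AWS credentials through GitHub's OIDC token rather than long-lived keys; the role configuration itself lives in that reusable workflow and is not shown here. A quick way to confirm such credentials from inside a job once a role has been assumed (illustration only):

# Illustration only: with OIDC-based credentials in place, the caller identity
# should be the CI upload role, not a static IAM user tied to stored keys.
aws sts get-caller-identity --output json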
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: conda-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cpu-build + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_10-cpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.10" + build_name: conda-py3_10-cpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + conda-py3_10-cuda11_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually 
want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
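The "Clean PyTorch checkout" and "Clean pytorch/builder checkout" steps below remove every untracked and ignored file with git clean -fxd. When reproducing a build locally it is worth previewing that deletion first (illustration only):

# Illustration only: preview what the cleanup steps below would delete, then run it.
git clean -nxd    # dry run: list untracked and ignored files that would be removed
git clean -fxd    # actually remove them (the same flags the workflow uses)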
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: conda-py3_10-cuda11_8 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cuda11_8-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda11_8-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda11_8 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cuda11_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_10-cuda11_8-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.10" + build_name: conda-py3_10-cuda11_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + conda-py3_10-cuda12_1-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: 
This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
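The job, artifact, and build names throughout this file follow a single convention: a DESIRED_CUDA value such as "cu121" pairs with GPU_ARCH_VERSION "12.1", and the build name has the shape "<package>-py<major>_<minor>-cuda<major>_<minor>" (or "-cpu" for CPU-only jobs). The bash sketch below only illustrates that mapping; the build_name helper is hypothetical, and the real names are baked into the generated YAML rather than computed at runtime.

    # Illustrative only: derive a build/artifact name from the fields the
    # env blocks above set (PACKAGE_TYPE, DESIRED_PYTHON, DESIRED_CUDA).
    build_name() {
      local package_type=$1 desired_python=$2 desired_cuda=$3
      local arch_tag
      if [[ "${desired_cuda}" == "cpu" ]]; then
        arch_tag="cpu"
      else
        local ver="${desired_cuda#cu}"          # e.g. "121"
        arch_tag="cuda${ver:0:2}_${ver:2}"      # e.g. "cuda12_1"
      fi
      echo "${package_type}-py${desired_python/./_}-${arch_tag}"
    }
    build_name conda 3.10 cu121   # -> conda-py3_10-cuda12_1
    build_name conda 3.12 cpu     # -> conda-py3_12-cpu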
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: conda-py3_10-cuda12_1 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cuda12_1-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: conda-py3_10-cuda12_1-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: conda-py3_10-cuda12_1 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + conda-py3_10-cuda12_1-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: conda-py3_10-cuda12_1-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: conda + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.10" + build_name: conda-py3_10-cuda12_1 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + conda-py3_10-cuda12_4-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -1500,8 +2728,9 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of 
GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" steps: @@ -1584,7 +2813,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_10-cpu + name: conda-py3_10-cuda12_4 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1601,10 +2830,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_10-cpu-test: # Testing + conda-py3_10-cuda12_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cpu-build - runs-on: windows.4xlarge.nonephemeral + needs: conda-py3_10-cuda12_4-build + runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1612,8 +2841,9 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" steps: @@ -1663,7 +2893,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_10-cpu + name: conda-py3_10-cuda12_4 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -1711,27 +2941,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_10-cpu-upload: # Uploading + conda-py3_10-cuda12_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cpu-test + permissions: + id-token: write + contents: read + needs: conda-py3_10-cuda12_4-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" - build_name: conda-py3_10-cpu + build_name: conda-py3_10-cuda12_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_10-cuda11_8-build: + conda-py3_11-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -1741,11 +2973,10 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -1826,7 +3057,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_10-cuda11_8 + name: conda-py3_11-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1843,10 +3074,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_10-cuda11_8-test: # Testing + conda-py3_11-cpu-test: # Testing if: ${{ 
github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_8-build - runs-on: windows.8xlarge.nvidia.gpu + needs: conda-py3_11-cpu-build + runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1854,11 +3085,10 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -1906,7 +3136,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_10-cuda11_8 + name: conda-py3_11-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -1954,28 +3184,28 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_10-cuda11_8-upload: # Uploading + conda-py3_11-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda11_8-test + permissions: + id-token: write + contents: read + needs: conda-py3_11-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.10" - build_name: conda-py3_10-cuda11_8 + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.11" + build_name: conda-py3_11-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_10-cuda12_1-build: + conda-py3_11-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -1985,11 +3215,11 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -2070,7 +3300,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_10-cuda12_1 + name: conda-py3_11-cuda11_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2087,9 +3317,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_10-cuda12_1-test: # Testing + conda-py3_11-cuda11_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda12_1-build + needs: conda-py3_11-cuda11_8-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -2098,11 +3328,11 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - 
DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -2150,7 +3380,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_10-cuda12_1 + name: conda-py3_11-cuda11_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2198,28 +3428,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_10-cuda12_1-upload: # Uploading + conda-py3_11-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_10-cuda12_1-test + permissions: + id-token: write + contents: read + needs: conda-py3_11-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.10" - build_name: conda-py3_10-cuda12_1 + DESIRED_PYTHON: "3.11" + build_name: conda-py3_11-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_11-cpu-build: + conda-py3_11-cuda12_1-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -2229,8 +3460,9 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" steps: @@ -2313,7 +3545,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_11-cpu + name: conda-py3_11-cuda12_1 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2330,10 +3562,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_11-cpu-test: # Testing + conda-py3_11-cuda12_1-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_11-cpu-build - runs-on: windows.4xlarge.nonephemeral + needs: conda-py3_11-cuda12_1-build + runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -2341,8 +3573,9 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" steps: @@ -2392,7 +3625,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_11-cpu + name: conda-py3_11-cuda12_1 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2440,27 +3673,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_11-cpu-upload: # Uploading + conda-py3_11-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: 
conda-py3_11-cpu-test + permissions: + id-token: write + contents: read + needs: conda-py3_11-cuda12_1-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" - build_name: conda-py3_11-cpu + build_name: conda-py3_11-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_11-cuda11_8-build: + conda-py3_11-cuda12_4-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -2470,8 +3705,8 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -2555,7 +3790,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_11-cuda11_8 + name: conda-py3_11-cuda12_4 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2572,9 +3807,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_11-cuda11_8-test: # Testing + conda-py3_11-cuda12_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_11-cuda11_8-build + needs: conda-py3_11-cuda12_4-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -2583,8 +3818,8 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -2635,7 +3870,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_11-cuda11_8 + name: conda-py3_11-cuda12_4 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2683,28 +3918,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_11-cuda11_8-upload: # Uploading + conda-py3_11-cuda12_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_11-cuda11_8-test + permissions: + id-token: write + contents: read + needs: conda-py3_11-cuda12_4-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" - build_name: conda-py3_11-cuda11_8 + build_name: conda-py3_11-cuda12_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ 
secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_11-cuda12_1-build: + conda-py3_12-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -2714,11 +3950,10 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -2799,7 +4034,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_11-cuda12_1 + name: conda-py3_12-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2816,10 +4051,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_11-cuda12_1-test: # Testing + conda-py3_12-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_11-cuda12_1-build - runs-on: windows.8xlarge.nvidia.gpu + needs: conda-py3_12-cpu-build + runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -2827,11 +4062,10 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -2879,7 +4113,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_11-cuda12_1 + name: conda-py3_12-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2927,28 +4161,28 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_11-cuda12_1-upload: # Uploading + conda-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_11-cuda12_1-test + permissions: + id-token: write + contents: read + needs: conda-py3_12-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.11" - build_name: conda-py3_11-cuda12_1 + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.12" + build_name: conda-py3_12-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_12-cpu-build: + conda-py3_12-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -2958,8 +4192,9 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually 
want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" steps: @@ -3042,7 +4277,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_12-cpu + name: conda-py3_12-cuda11_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3059,10 +4294,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_12-cpu-test: # Testing + conda-py3_12-cuda11_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_12-cpu-build - runs-on: windows.4xlarge.nonephemeral + needs: conda-py3_12-cuda11_8-build + runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -3070,8 +4305,9 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" steps: @@ -3121,7 +4357,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_12-cpu + name: conda-py3_12-cuda11_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -3169,27 +4405,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_12-cpu-upload: # Uploading + conda-py3_12-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_12-cpu-test + permissions: + id-token: write + contents: read + needs: conda-py3_12-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" - build_name: conda-py3_12-cpu + build_name: conda-py3_12-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_12-cuda11_8-build: + conda-py3_12-cuda12_1-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -3199,8 +4437,8 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3284,7 +4522,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_12-cuda11_8 + name: conda-py3_12-cuda12_1 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3301,9 +4539,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_12-cuda11_8-test: # Testing + conda-py3_12-cuda12_1-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - 
needs: conda-py3_12-cuda11_8-build + needs: conda-py3_12-cuda12_1-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -3312,8 +4550,8 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3364,7 +4602,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_12-cuda11_8 + name: conda-py3_12-cuda12_1 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -3412,28 +4650,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_12-cuda11_8-upload: # Uploading + conda-py3_12-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_12-cuda11_8-test + permissions: + id-token: write + contents: read + needs: conda-py3_12-cuda12_1-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" - build_name: conda-py3_12-cuda11_8 + build_name: conda-py3_12-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - conda-py3_12-cuda12_1-build: + conda-py3_12-cuda12_4-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -3443,8 +4682,8 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3528,7 +4767,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: conda-py3_12-cuda12_1 + name: conda-py3_12-cuda12_4 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3545,9 +4784,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_12-cuda12_1-test: # Testing + conda-py3_12-cuda12_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_12-cuda12_1-build + needs: conda-py3_12-cuda12_4-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -3556,8 +4795,8 @@ jobs: PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3608,7 +4847,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: conda-py3_12-cuda12_1 + name: conda-py3_12-cuda12_4 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ 
-3656,24 +4895,25 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - conda-py3_12-cuda12_1-upload: # Uploading + conda-py3_12-cuda12_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: conda-py3_12-cuda12_1-test + permissions: + id-token: write + contents: read + needs: conda-py3_12-cuda12_4-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: conda # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" - build_name: conda-py3_12-cuda12_1 + build_name: conda-py3_12-cuda12_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml index 7b94603344e38..60ba59556926f 100644 --- a/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-debug-nightly.yml @@ -263,6 +263,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 libtorch-cpu-shared-with-deps-debug-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cpu-shared-with-deps-debug-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -280,8 +283,6 @@ jobs: build_name: libtorch-cpu-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -518,6 +519,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 libtorch-cuda11_8-shared-with-deps-debug-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cuda11_8-shared-with-deps-debug-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -536,8 +540,6 @@ jobs: build_name: libtorch-cuda11_8-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -774,6 +776,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 libtorch-cuda12_1-shared-with-deps-debug-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cuda12_1-shared-with-deps-debug-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -792,8 +797,263 @@ jobs: 
build_name: libtorch-cuda12_1-shared-with-deps-debug secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + libtorch-cuda12_4-shared-with-deps-debug-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
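The get_ec2_metadata helper defined by each "Display EC2 information" step queries the EC2 instance metadata service at its link-local address, so it only returns data when the job runs on an EC2 runner. A short usage sketch follows; the availability-zone lookup is a standard IMDS path included purely as an example and is not something these jobs print:

    # IMDSv1 query; 169.254.169.254 is only reachable from inside EC2.
    get_ec2_metadata() {
      curl -fsSL "http://169.254.169.254/latest/meta-data/$1"
    }
    echo "ami-id:        $(get_ec2_metadata ami-id)"
    echo "instance-type: $(get_ec2_metadata instance-type)"
    echo "az:            $(get_ec2_metadata placement/availability-zone)"  # illustrative extra category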
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: libtorch-cuda12_4-shared-with-deps-debug + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda12_4-shared-with-deps-debug-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda12_4-shared-with-deps-debug-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path 
"HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda12_4-shared-with-deps-debug + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda12_4-shared-with-deps-debug-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_4-shared-with-deps-debug-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + LIBTORCH_CONFIG: debug + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.8" + build_name: libtorch-cuda12_4-shared-with-deps-debug + secrets: + github-token: ${{ secrets.GITHUB_TOKEN 
}} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml index 7f13d5a2f5f88..842de97a1fbe9 100644 --- a/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml +++ b/.github/workflows/generated-windows-binary-libtorch-release-nightly.yml @@ -263,6 +263,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 libtorch-cpu-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cpu-shared-with-deps-release-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -280,8 +283,6 @@ jobs: build_name: libtorch-cpu-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -518,6 +519,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 libtorch-cuda11_8-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cuda11_8-shared-with-deps-release-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -536,8 +540,6 @@ jobs: build_name: libtorch-cuda11_8-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -774,6 +776,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 libtorch-cuda12_1-shared-with-deps-release-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: libtorch-cuda12_1-shared-with-deps-release-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -792,8 +797,263 @@ jobs: build_name: libtorch-cuda12_1-shared-with-deps-release secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + libtorch-cuda12_4-shared-with-deps-release-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + 
LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: libtorch-cuda12_4-shared-with-deps-release + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda12_4-shared-with-deps-release-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: libtorch-cuda12_4-shared-with-deps-release-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path 
"HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: libtorch-cuda12_4-shared-with-deps-release + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + libtorch-cuda12_4-shared-with-deps-release-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: libtorch-cuda12_4-shared-with-deps-release-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: libtorch + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + LIBTORCH_CONFIG: release + LIBTORCH_VARIANT: shared-with-deps + # This is a dummy value for libtorch to work correctly with our batch scripts + # without this value pip does not get installed for some reason + DESIRED_PYTHON: "3.8" + build_name: libtorch-cuda12_4-shared-with-deps-release + secrets: + github-token: ${{ 
secrets.GITHUB_TOKEN }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/generated-windows-binary-wheel-nightly.yml b/.github/workflows/generated-windows-binary-wheel-nightly.yml index 69917c03d4ce0..d64c221e7895f 100644 --- a/.github/workflows/generated-windows-binary-wheel-nightly.yml +++ b/.github/workflows/generated-windows-binary-wheel-nightly.yml @@ -46,7 +46,7 @@ jobs: GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -256,6 +256,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 wheel-py3_8-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_8-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -269,8 +272,6 @@ jobs: build_name: wheel-py3_8-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -289,7 +290,7 @@ 
jobs: GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -500,6 +501,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 wheel-py3_8-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_8-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -514,8 +518,6 @@ jobs: build_name: wheel-py3_8-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -534,7 +536,7 @@ jobs: GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.8" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -745,6 +747,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 wheel-py3_8-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_8-cuda12_1-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -759,8 +764,252 @@ jobs: build_name: wheel-py3_8-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_8-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system 
== 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: wheel-py3_8-cuda12_4 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_8-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_8-cuda12_4-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.8" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: wheel-py3_8-cuda12_4 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_8-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_8-cuda12_4-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.8" + build_name: wheel-py3_8-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -778,7 +1027,7 @@ jobs: GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -988,6 +1237,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 wheel-py3_9-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_9-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1001,8 +1253,6 @@ jobs: build_name: wheel-py3_9-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1021,7 +1271,7 @@ jobs: GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -1232,6 +1482,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 wheel-py3_9-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_9-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1246,8 +1499,6 @@ jobs: build_name: wheel-py3_9-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml @@ -1266,7 +1517,7 @@ jobs: GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.9" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; 
platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -1477,6 +1728,9 @@ jobs: .github\scripts\kill_active_ssh_sessions.ps1 wheel-py3_9-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read needs: wheel-py3_9-cuda12_1-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1491,12 +1745,991 @@ jobs: build_name: wheel-py3_9-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cpu-build: + wheel-py3_9-cuda12_4-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see 
https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: wheel-py3_9-cuda12_4 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cuda12_4-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} 
+ needs: wheel-py3_9-cuda12_4-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.9" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: wheel-py3_9-cuda12_4 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_9-cuda12_4-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_9-cuda12_4-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.9" + build_name: wheel-py3_9-cuda12_4 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-cpu-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: wheel-py3_10-cpu + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cpu-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cpu-build + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: wheel-py3_10-cpu + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cpu-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-cpu-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cpu + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-cuda11_8-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually 
want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
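
Editor's note on the PYTORCH_EXTRA_INSTALL_REQUIREMENTS values bumped in these hunks (e.g. nvidia-nccl-cu12 2.19.3 -> 2.20.5): the value is one "|"-separated list of PEP 508 requirement strings, and the "platform_system == 'Linux' and platform_machine == 'x86_64'" markers mean the pinned NVIDIA wheels are only resolved on Linux x86_64 installs, even though the variable is set inside a Windows build job. A minimal Python sketch of how such markers evaluate, assuming the third-party "packaging" library; this is illustrative only and is not the code used by the builder scripts:

# Illustrative only: shows how the PEP 508 markers carried in
# PYTORCH_EXTRA_INSTALL_REQUIREMENTS gate each extra dependency.
# Assumes the third-party "packaging" library (pip install packaging).
from packaging.requirements import Requirement

extra_requirements = (
    "nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | "
    "nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64'"
)

for raw in extra_requirements.split(" | "):
    req = Requirement(raw)
    # evaluate() with an explicit environment; with no arguments it uses the
    # running interpreter's platform, so on Windows these markers are False.
    on_linux_x86 = req.marker.evaluate({"platform_system": "Linux", "platform_machine": "x86_64"})
    on_windows = req.marker.evaluate({"platform_system": "Windows", "platform_machine": "AMD64"})
    print(f"{req.name}: linux-x86_64={on_linux_x86} windows={on_windows}")

pip performs the same marker evaluation at install time, which is why embedding these requirements in the Windows wheels built here is harmless outside Linux x86_64.
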
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: wheel-py3_10-cuda11_8 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda11_8-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cuda11_8-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: wheel-py3_10-cuda11_8 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda11_8-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-cuda11_8-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cuda11_8 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-cuda12_1-build: + if: ${{ github.repository_owner == 'pytorch' }} + runs-on: windows.4xlarge.nonephemeral + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: 
This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. 
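
Editor's note on the overall shape of this diff: it repeats one build -> test -> upload job triad per (Python, CUDA) combination, changing only the job names, DESIRED_CUDA / GPU_ARCH_VERSION, the test runner label, and the artifact name, and it switches every upload job to the reusable ./.github/workflows/_binary-upload.yml with OIDC permissions (id-token: write, contents: read) in place of the removed AWS uploader keys. A rough Python sketch of that naming pattern, purely illustrative (the real workflow file is machine-generated elsewhere in the repository, not by this snippet):

# Illustrative sketch of the job-naming pattern visible in the hunks above:
# one build/test/upload triple per (python, arch) combination.
# This is NOT the actual generator used by the repository.
python_versions = ["3_10", "3_11", "3_12"]
test_runners = {
    "cpu": "windows.4xlarge.nonephemeral",
    "cuda11_8": "windows.8xlarge.nvidia.gpu",
    "cuda12_1": "windows.8xlarge.nvidia.gpu",
    "cuda12_4": "windows.8xlarge.nvidia.gpu",  # newly added in this diff
}

for py in python_versions:
    for arch, test_runner in test_runners.items():
        base = f"wheel-py{py}-{arch}"
        print(f"{base}-build   runs-on: windows.4xlarge.nonephemeral")
        print(f"{base}-test    runs-on: {test_runner}, needs: {base}-build")
        print(f"{base}-upload  needs: {base}-test, uses: ./.github/workflows/_binary-upload.yml")

The test job for each triad downloads the artifact produced by its build job under the same name (e.g. wheel-py3_10-cuda12_4), which is why the artifact renames in these hunks must stay consistent across all three jobs.
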
+ - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Build PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_build.sh" + - uses: actions/upload-artifact@v3 + if: always() + with: + name: wheel-py3_10-cuda12_1 + retention-days: 14 + if-no-files-found: error + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda12_1-test: # Testing + if: ${{ github.repository_owner == 'pytorch' }} + needs: wheel-py3_10-cuda12_1-build + runs-on: windows.8xlarge.nvidia.gpu + timeout-minutes: 240 + env: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda + SKIP_ALL_TESTS: 1 + DESIRED_PYTHON: "3.10" + steps: + - name: Display EC2 information + shell: bash + run: | + set -euo pipefail + function get_ec2_metadata() { + # Pulled from instance metadata endpoint for EC2 + # see https://docs.aws.amazon.com/AWSEC2/latest/UserGuide/instancedata-data-retrieval.html + category=$1 + curl -fsSL "http://169.254.169.254/latest/meta-data/${category}" + } + echo "ami-id: $(get_ec2_metadata ami-id)" + echo "instance-id: $(get_ec2_metadata instance-id)" + echo "instance-type: $(get_ec2_metadata instance-type)" + echo "system info $(uname -a)" + - name: "[FB EMPLOYEES] Enable SSH (Click me for login details)" + uses: pytorch/test-infra/.github/actions/setup-ssh@main + continue-on-error: true + with: + github-secret: ${{ secrets.GITHUB_TOKEN }} + # Needed for binary builds, see: https://github.com/pytorch/pytorch/issues/73339#issuecomment-1058981560 + - name: Enable long paths on Windows + shell: powershell + run: | + Set-ItemProperty -Path "HKLM:\\SYSTEM\CurrentControlSet\Control\FileSystem" -Name "LongPathsEnabled" -Value 1 + # Since it's just a defensive command, the workflow should continue even the command fails. 
This step can be + # removed once Windows Defender is removed from the AMI + - name: Disables Windows Defender scheduled and real-time scanning for files in directories used by PyTorch + continue-on-error: true + shell: powershell + run: | + Add-MpPreference -ExclusionPath $(Get-Location).tostring(),$Env:TEMP -ErrorAction Ignore + # Let's both exclude the path and disable Windows Defender completely just to be sure + # that it doesn't interfere + Set-MpPreference -DisableRealtimeMonitoring $True -ErrorAction Ignore + # NOTE: These environment variables are put here so that they can be applied on every job equally + # They are also here because setting them at a workflow level doesn't give us access to the + # runner.temp variable, which we need. + - name: Populate binary env + shell: bash + run: | + echo "BINARY_ENV_FILE=${RUNNER_TEMP}/env" >> "${GITHUB_ENV}" + echo "PYTORCH_FINAL_PACKAGE_DIR=${RUNNER_TEMP}/artifacts" >> "${GITHUB_ENV}" + echo "WIN_PACKAGE_WORK_DIR=${RUNNER_TEMP}" + - uses: actions/download-artifact@v3 + name: Download Build Artifacts + with: + name: wheel-py3_10-cuda12_1 + path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" + - name: Checkout PyTorch + uses: malfet/checkout@silent-checkout + with: + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + submodules: recursive + path: pytorch + quiet-checkout: true + - name: Clean PyTorch checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: pytorch + - name: Checkout pytorch/builder + uses: malfet/checkout@silent-checkout + with: + ref: main + submodules: recursive + repository: pytorch/builder + path: builder + quiet-checkout: true + - name: Clean pytorch/builder checkout + run: | + # Remove any artifacts from the previous checkouts + git clean -fxd + working-directory: builder + - name: Populate binary env + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_populate_env.sh" + - name: Test PyTorch binary + shell: bash + run: | + "${PYTORCH_ROOT}/.circleci/scripts/binary_windows_test.sh" + - name: Wait until all sessions have drained + shell: powershell + working-directory: pytorch + if: always() + timeout-minutes: 120 + run: | + .github\scripts\wait_for_ssh_to_drain.ps1 + - name: Kill active ssh sessions if still around (Useful if workflow was cancelled) + shell: powershell + working-directory: pytorch + if: always() + run: | + .github\scripts\kill_active_ssh_sessions.ps1 + wheel-py3_10-cuda12_1-upload: # Uploading + if: ${{ github.repository_owner == 'pytorch' }} + permissions: + id-token: write + contents: read + needs: wheel-py3_10-cuda12_1-test + with: + PYTORCH_ROOT: ${{ github.workspace }}/pytorch + BUILDER_ROOT: ${{ github.workspace }}/builder + PACKAGE_TYPE: wheel + # TODO: This is a legacy variable that we eventually want to get rid of in + # favor of GPU_ARCH_VERSION + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda + DESIRED_PYTHON: "3.10" + build_name: wheel-py3_10-cuda12_1 + secrets: + github-token: ${{ secrets.GITHUB_TOKEN }} + conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} + conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} + uses: ./.github/workflows/_binary-upload.yml + wheel-py3_10-cuda12_4-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -1506,11 +2739,12 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of 
GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -1591,7 +2825,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_10-cpu + name: wheel-py3_10-cuda12_4 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1608,10 +2842,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cpu-test: # Testing + wheel-py3_10-cuda12_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cpu-build - runs-on: windows.4xlarge.nonephemeral + needs: wheel-py3_10-cuda12_4-build + runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1619,8 +2853,9 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.10" steps: @@ -1670,7 +2905,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_10-cpu + name: wheel-py3_10-cuda12_4 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: 
malfet/checkout@silent-checkout @@ -1718,27 +2953,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cpu-upload: # Uploading + wheel-py3_10-cuda12_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cpu-test + permissions: + id-token: write + contents: read + needs: wheel-py3_10-cuda12_4-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cpu + build_name: wheel-py3_10-cuda12_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cuda11_8-build: + wheel-py3_11-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -1748,12 +2985,11 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + DESIRED_PYTHON: "3.11" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 
'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -1834,7 +3070,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_10-cuda11_8 + name: wheel-py3_11-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -1851,10 +3087,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_8-test: # Testing + wheel-py3_11-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda11_8-build - runs-on: windows.8xlarge.nvidia.gpu + needs: wheel-py3_11-cpu-build + runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -1862,11 +3098,10 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -1914,7 +3149,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_10-cuda11_8 + name: wheel-py3_11-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -1962,28 +3197,28 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda11_8-upload: # Uploading + wheel-py3_11-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda11_8-test + permissions: + id-token: write + contents: read + needs: wheel-py3_11-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cuda11_8 + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_10-cuda12_1-build: + wheel-py3_11-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -1993,12 +3228,12 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 
'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + DESIRED_PYTHON: "3.11" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -2079,7 +3314,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_10-cuda12_1 + name: wheel-py3_11-cuda11_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2096,9 +3331,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_1-test: # Testing + wheel-py3_11-cuda11_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda12_1-build + needs: wheel-py3_11-cuda11_8-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -2107,11 +3342,11 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.10" + DESIRED_PYTHON: "3.11" steps: - name: Display EC2 information shell: bash @@ -2159,7 +3394,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_10-cuda12_1 + name: wheel-py3_11-cuda11_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2207,28 +3442,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_10-cuda12_1-upload: # Uploading + wheel-py3_11-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_10-cuda12_1-test + permissions: + id-token: write + contents: read + needs: 
wheel-py3_11-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.10" - build_name: wheel-py3_10-cuda12_1 + DESIRED_PYTHON: "3.11" + build_name: wheel-py3_11-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cpu-build: + wheel-py3_11-cuda12_1-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -2238,11 +3474,12 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ 
-2323,7 +3560,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_11-cpu + name: wheel-py3_11-cuda12_1 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2340,10 +3577,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cpu-test: # Testing + wheel-py3_11-cuda12_1-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_11-cpu-build - runs-on: windows.4xlarge.nonephemeral + needs: wheel-py3_11-cuda12_1-build + runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -2351,8 +3588,9 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" steps: @@ -2402,7 +3640,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_11-cpu + name: wheel-py3_11-cuda12_1 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2450,27 +3688,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cpu-upload: # Uploading + wheel-py3_11-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_11-cpu-test + permissions: + id-token: write + contents: read + needs: wheel-py3_11-cuda12_1-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cpu + build_name: wheel-py3_11-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cuda11_8-build: + wheel-py3_11-cuda12_4-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -2480,12 +3720,12 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and 
platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -2566,7 +3806,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_11-cuda11_8 + name: wheel-py3_11-cuda12_4 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2583,9 +3823,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda11_8-test: # Testing + wheel-py3_11-cuda12_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_11-cuda11_8-build + needs: wheel-py3_11-cuda12_4-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -2594,8 +3834,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.11" @@ -2646,7 +3886,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_11-cuda11_8 + name: wheel-py3_11-cuda12_4 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2694,28 +3934,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda11_8-upload: # Uploading + wheel-py3_11-cuda12_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_11-cuda11_8-test + permissions: + id-token: write + contents: read + needs: wheel-py3_11-cuda12_4-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cuda11_8 + build_name: wheel-py3_11-cuda12_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ 
secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_11-cuda12_1-build: + wheel-py3_12-cpu-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -2725,12 +3966,11 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + DESIRED_PYTHON: "3.12" + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -2811,7 +4051,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_11-cuda12_1 + name: wheel-py3_12-cpu retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -2828,10 +4068,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_1-test: # Testing + wheel-py3_12-cpu-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_11-cuda12_1-build - runs-on: windows.8xlarge.nvidia.gpu + needs: wheel-py3_12-cpu-build + 
runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -2839,11 +4079,10 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu SKIP_ALL_TESTS: 1 - DESIRED_PYTHON: "3.11" + DESIRED_PYTHON: "3.12" steps: - name: Display EC2 information shell: bash @@ -2891,7 +4130,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_11-cuda12_1 + name: wheel-py3_12-cpu path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -2939,28 +4178,28 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_11-cuda12_1-upload: # Uploading + wheel-py3_12-cpu-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_11-cuda12_1-test + permissions: + id-token: write + contents: read + needs: wheel-py3_12-cpu-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 - GPU_ARCH_TYPE: cuda - DESIRED_PYTHON: "3.11" - build_name: wheel-py3_11-cuda12_1 + DESIRED_CUDA: cpu + GPU_ARCH_TYPE: cpu + DESIRED_PYTHON: "3.12" + build_name: wheel-py3_12-cpu secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cpu-build: + wheel-py3_12-cuda11_8-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -2970,11 +4209,12 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: 
nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -3055,7 +4295,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_12-cpu + name: wheel-py3_12-cuda11_8 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3072,10 +4312,10 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cpu-test: # Testing + wheel-py3_12-cuda11_8-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_12-cpu-build - runs-on: windows.4xlarge.nonephemeral + needs: wheel-py3_12-cuda11_8-build + runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: PYTORCH_ROOT: ${{ github.workspace }}/pytorch @@ -3083,8 +4323,9 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" steps: @@ -3134,7 +4375,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_12-cpu + name: wheel-py3_12-cuda11_8 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -3182,27 +4423,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cpu-upload: # Uploading + wheel-py3_12-cuda11_8-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_12-cpu-test + permissions: + id-token: write + contents: read + needs: wheel-py3_12-cuda11_8-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cpu - GPU_ARCH_TYPE: cpu + DESIRED_CUDA: cu118 + GPU_ARCH_VERSION: 11.8 + GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cpu + build_name: wheel-py3_12-cuda11_8 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cuda11_8-build: + wheel-py3_12-cuda12_1-build: 
if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -3212,12 +4455,12 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -3298,7 +4541,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_12-cuda11_8 + name: wheel-py3_12-cuda12_1 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3315,9 +4558,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda11_8-test: # Testing + wheel-py3_12-cuda12_1-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_12-cuda11_8-build + needs: wheel-py3_12-cuda12_1-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -3326,8 +4569,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3378,7 +4621,7 @@ jobs: - uses: actions/download-artifact@v3 
name: Download Build Artifacts with: - name: wheel-py3_12-cuda11_8 + name: wheel-py3_12-cuda12_1 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -3426,28 +4669,29 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda11_8-upload: # Uploading + wheel-py3_12-cuda12_1-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_12-cuda11_8-test + permissions: + id-token: write + contents: read + needs: wheel-py3_12-cuda12_1-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu118 - GPU_ARCH_VERSION: 11.8 + DESIRED_CUDA: cu121 + GPU_ARCH_VERSION: 12.1 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cuda11_8 + build_name: wheel-py3_12-cuda12_1 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml - wheel-py3_12-cuda12_1-build: + wheel-py3_12-cuda12_4-build: if: ${{ github.repository_owner == 'pytorch' }} runs-on: windows.4xlarge.nonephemeral timeout-minutes: 240 @@ -3457,12 +4701,12 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" - PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.19.3; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' + PYTORCH_EXTRA_INSTALL_REQUIREMENTS: nvidia-cuda-nvrtc-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-runtime-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cuda-cupti-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cudnn-cu12==8.9.2.26; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cublas-cu12==12.1.3.1; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cufft-cu12==11.0.2.54; platform_system == 'Linux' and platform_machine == 'x86_64' | 
nvidia-curand-cu12==10.3.2.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusolver-cu12==11.4.5.107; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-cusparse-cu12==12.1.0.106; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nccl-cu12==2.20.5; platform_system == 'Linux' and platform_machine == 'x86_64' | nvidia-nvtx-cu12==12.1.105; platform_system == 'Linux' and platform_machine == 'x86_64' steps: - name: Display EC2 information shell: bash @@ -3543,7 +4787,7 @@ jobs: - uses: actions/upload-artifact@v3 if: always() with: - name: wheel-py3_12-cuda12_1 + name: wheel-py3_12-cuda12_4 retention-days: 14 if-no-files-found: error path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" @@ -3560,9 +4804,9 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_1-test: # Testing + wheel-py3_12-cuda12_4-test: # Testing if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_12-cuda12_1-build + needs: wheel-py3_12-cuda12_4-build runs-on: windows.8xlarge.nvidia.gpu timeout-minutes: 240 env: @@ -3571,8 +4815,8 @@ jobs: PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda SKIP_ALL_TESTS: 1 DESIRED_PYTHON: "3.12" @@ -3623,7 +4867,7 @@ jobs: - uses: actions/download-artifact@v3 name: Download Build Artifacts with: - name: wheel-py3_12-cuda12_1 + name: wheel-py3_12-cuda12_4 path: "${{ env.PYTORCH_FINAL_PACKAGE_DIR }}" - name: Checkout PyTorch uses: malfet/checkout@silent-checkout @@ -3671,24 +4915,25 @@ jobs: if: always() run: | .github\scripts\kill_active_ssh_sessions.ps1 - wheel-py3_12-cuda12_1-upload: # Uploading + wheel-py3_12-cuda12_4-upload: # Uploading if: ${{ github.repository_owner == 'pytorch' }} - needs: wheel-py3_12-cuda12_1-test + permissions: + id-token: write + contents: read + needs: wheel-py3_12-cuda12_4-test with: PYTORCH_ROOT: ${{ github.workspace }}/pytorch BUILDER_ROOT: ${{ github.workspace }}/builder PACKAGE_TYPE: wheel # TODO: This is a legacy variable that we eventually want to get rid of in # favor of GPU_ARCH_VERSION - DESIRED_CUDA: cu121 - GPU_ARCH_VERSION: 12.1 + DESIRED_CUDA: cu124 + GPU_ARCH_VERSION: 12.4 GPU_ARCH_TYPE: cuda DESIRED_PYTHON: "3.12" - build_name: wheel-py3_12-cuda12_1 + build_name: wheel-py3_12-cuda12_4 secrets: github-token: ${{ secrets.GITHUB_TOKEN }} - aws-pytorch-uploader-access-key-id: ${{ secrets.AWS_PYTORCH_UPLOADER_ACCESS_KEY_ID }} - aws-pytorch-uploader-secret-access-key: ${{ secrets.AWS_PYTORCH_UPLOADER_SECRET_ACCESS_KEY }} conda-pytorchbot-token: ${{ secrets.CONDA_PYTORCHBOT_TOKEN }} conda-pytorchbot-token-test: ${{ secrets.CONDA_PYTORCHBOT_TOKEN_TEST }} uses: ./.github/workflows/_binary-upload.yml diff --git a/.github/workflows/inductor-micro-benchmark.yml b/.github/workflows/inductor-micro-benchmark.yml new file mode 100644 index 0000000000000..4fe0ddf50ef2a --- /dev/null +++ b/.github/workflows/inductor-micro-benchmark.yml @@ -0,0 +1,40 @@ +name: inductor-micro-benchmark + +on: + schedule: + - cron: 0 7 * * * + push: + tags: + - ciflow/inductor-micro-benchmark/* + workflow_dispatch: + + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + 
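The new inductor-micro-benchmark workflow above only fires on the nightly cron, on a ciflow tag push, or via manual dispatch. A minimal sketch of triggering it by hand, assuming a local checkout with push rights and the GitHub CLI installed (the tag suffix below is purely illustrative; in practice the ciflow tag is normally pushed by pytorchbot when the corresponding ciflow label is applied):

    # Option 1: push a tag matching the ciflow/inductor-micro-benchmark/* trigger
    git tag ciflow/inductor-micro-benchmark/12345        # suffix is illustrative
    git push origin ciflow/inductor-micro-benchmark/12345

    # Option 2: fire the workflow_dispatch event directly
    gh workflow run inductor-micro-benchmark.yml --ref main
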
+permissions: read-all + +jobs: + linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build: + name: cuda12.1-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80 + docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks + cuda-arch-list: '8.0' + test-matrix: | + { include: [ + { config: "inductor-micro-benchmark", shard: 1, num_shards: 1, runner: "linux.gcp.a100" }, + ]} + + linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-test: + name: cuda12.1-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build + with: + build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80 + docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-micro-benchmark-build.outputs.test-matrix }} + use-gha: anything-non-empty-to-use-gha + timeout-minutes: 720 diff --git a/.github/workflows/inductor-perf-compare.yml b/.github/workflows/inductor-perf-compare.yml index 444cf3c428136..e485a8bfce1b7 100644 --- a/.github/workflows/inductor-perf-compare.yml +++ b/.github/workflows/inductor-perf-compare.yml @@ -10,6 +10,8 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true +permissions: read-all + jobs: linux-focal-cuda12_1-py3_10-gcc9-inductor-build: name: cuda12.1-py3.10-gcc9-sm80 diff --git a/.github/workflows/inductor-perf-test-nightly.yml b/.github/workflows/inductor-perf-test-nightly.yml index e8a811d55b6b5..e77c915749f3f 100644 --- a/.github/workflows/inductor-perf-test-nightly.yml +++ b/.github/workflows/inductor-perf-test-nightly.yml @@ -4,15 +4,17 @@ on: schedule: - cron: 0 7 * * 1-6 - cron: 0 7 * * 0 + # NB: GitHub has an upper limit of 10 inputs here, so before we can sort it + # out, let try to run torchao cudagraphs_low_precision as part of cudagraphs workflow_dispatch: inputs: training: - description: Run training? + description: Run training (on by default)? required: false type: boolean default: true inference: - description: Run inference? + description: Run inference (off by default)? required: false type: boolean default: false @@ -20,22 +22,17 @@ on: description: Run inductor_default? required: false type: boolean - default: true + default: false dynamic: description: Run inductor_dynamic_shapes? required: false type: boolean - default: true + default: false cudagraphs: description: Run inductor_cudagraphs? required: false type: boolean default: true - cppwrapper: - description: Run inductor_cpp_wrapper for inference? - required: false - type: boolean - default: false freezing_cudagraphs: description: Run inductor_cudagraphs with freezing for inference? 
required: false @@ -56,11 +53,18 @@ on: required: false type: boolean default: false + benchmark_configs: + description: The list of configs used the benchmark + required: false + type: string + default: inductor_huggingface_perf,inductor_timm_perf,inductor_torchbench_perf concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +permissions: read-all + jobs: linux-focal-cuda12_1-py3_10-gcc9-inductor-build: name: cuda12.1-py3.10-gcc9-sm80 @@ -84,6 +88,7 @@ jobs: { config: "inductor_torchbench_perf", shard: 3, num_shards: 4, runner: "linux.gcp.a100.large" }, { config: "inductor_torchbench_perf", shard: 4, num_shards: 4, runner: "linux.gcp.a100.large" }, ]} + selected-test-configs: ${{ inputs.benchmark_configs }} secrets: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} @@ -94,7 +99,7 @@ jobs: if: github.event.schedule == '0 7 * * 1-6' with: build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80 - dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-aotinductor-true-freezing_cudagraphs-true + dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-aotinductor-true-freezing_cudagraphs-true-cudagraphs_low_precision-true docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.test-matrix }} use-gha: anything-non-empty-to-use-gha @@ -109,7 +114,7 @@ jobs: if: github.event.schedule == '0 7 * * 0' with: build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80 - dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true + dashboard-tag: training-true-inference-true-default-true-dynamic-true-cudagraphs-true-aotinductor-true-freezing_cudagraphs-true-maxautotune-true-freeze_autotune_cudagraphs-true-cudagraphs_low_precision-true docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.test-matrix }} use-gha: anything-non-empty-to-use-gha @@ -124,7 +129,7 @@ jobs: if: github.event_name == 'workflow_dispatch' with: build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80 - dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-${{ inputs.cppwrapper }}-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }} + dashboard-tag: training-${{ inputs.training }}-inference-${{ inputs.inference }}-default-${{ inputs.default }}-dynamic-${{ inputs.dynamic }}-cudagraphs-${{ inputs.cudagraphs }}-cppwrapper-false-aotinductor-${{ inputs.aotinductor }}-maxautotune-${{ inputs.maxautotune }}-freezing_cudagraphs-${{ inputs.freezing_cudagraphs }}-cudagraphs_low_precision-${{ inputs.cudagraphs }} docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-inductor-build.outputs.test-matrix }} use-gha: anything-non-empty-to-use-gha diff --git a/.github/workflows/inductor-periodic.yml 
b/.github/workflows/inductor-periodic.yml index f775acf1e9e78..6f8c06ed030b0 100644 --- a/.github/workflows/inductor-periodic.yml +++ b/.github/workflows/inductor-periodic.yml @@ -14,6 +14,9 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true + +permissions: read-all + jobs: linux-focal-cuda12_1-py3_10-gcc9-periodic-dynamo-benchmarks-build: name: cuda12.1-py3.10-gcc9-sm86-periodic-dynamo-benchmarks diff --git a/.github/workflows/inductor.yml b/.github/workflows/inductor.yml index 015b197c2b3cf..0ad799a80bcc0 100644 --- a/.github/workflows/inductor.yml +++ b/.github/workflows/inductor.yml @@ -13,26 +13,31 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true +permissions: read-all + jobs: - linux-focal-rocm5_7-py3_8-inductor-build: - name: rocm5.7-py3.8-inductor + linux-focal-rocm6_1-py3_8-inductor-build: + name: rocm6.1-py3.8-inductor uses: ./.github/workflows/_linux-build.yml with: - build-environment: linux-focal-rocm5.7-py3.8 + build-environment: linux-focal-rocm6.1-py3.8 docker-image-name: pytorch-linux-focal-rocm-n-py3 test-matrix: | { include: [ { config: "inductor", shard: 1, num_shards: 1, runner: "linux.rocm.gpu.2" }, ]} - linux-focal-rocm5_7-py3_8-inductor-test: - name: rocm5.7-py3.8-inductor + linux-focal-rocm6_1-py3_8-inductor-test: + permissions: + id-token: write + contents: read + name: rocm6.1-py3.8-inductor uses: ./.github/workflows/_rocm-test.yml - needs: linux-focal-rocm5_7-py3_8-inductor-build + needs: linux-focal-rocm6_1-py3_8-inductor-build with: - build-environment: linux-focal-rocm5.7-py3.8 - docker-image: ${{ needs.linux-focal-rocm5_7-py3_8-inductor-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-focal-rocm5_7-py3_8-inductor-build.outputs.test-matrix }} + build-environment: linux-focal-rocm6.1-py3.8 + docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-inductor-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-inductor-build.outputs.test-matrix }} linux-focal-cuda12_1-py3_10-gcc9-inductor-build: name: cuda12.1-py3.10-gcc9-sm86 @@ -60,6 +65,7 @@ jobs: { config: "aot_inductor_timm", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "aot_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, { config: "aot_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "inductor_cpp_wrapper_abi_compatible", shard: 1, num_shards: 1, runner: "linux.g5.4xlarge.nvidia.gpu" }, ]} secrets: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} @@ -105,7 +111,7 @@ jobs: name: linux-jammy-cpu-py3.8-gcc11-inductor uses: ./.github/workflows/_linux-build.yml with: - build-environment: linux-jammy-py3_8-gcc11-build + build-environment: linux-jammy-py3.8-gcc11-build docker-image-name: pytorch-linux-jammy-py3.8-gcc11-inductor-benchmarks test-matrix: | { include: [ @@ -119,6 +125,7 @@ jobs: { config: "dynamic_cpu_inductor_timm", shard: 2, num_shards: 2, runner: "linux.12xlarge" }, { config: "dynamic_cpu_inductor_torchbench", shard: 1, num_shards: 2, runner: "linux.12xlarge" }, { config: "dynamic_cpu_inductor_torchbench", shard: 2, num_shards: 2, runner: "linux.12xlarge" }, + { config: 
"inductor_torchbench_cpu_smoketest_perf", shard: 1, num_shards: 1, runner: "linux.12xlarge" }, ]} secrets: HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} @@ -128,7 +135,7 @@ jobs: uses: ./.github/workflows/_linux-test.yml needs: linux-jammy-cpu-py3_8-gcc11-inductor-build with: - build-environment: linux-jammy-py3_8-gcc11-build + build-environment: linux-jammy-py3.8-gcc11-build docker-image: ${{ needs.linux-jammy-cpu-py3_8-gcc11-inductor-build.outputs.docker-image }} test-matrix: ${{ needs.linux-jammy-cpu-py3_8-gcc11-inductor-build.outputs.test-matrix }} secrets: diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml deleted file mode 100644 index f70d715c4d473..0000000000000 --- a/.github/workflows/labeler.yml +++ /dev/null @@ -1,22 +0,0 @@ -name: Labeler - -on: -- pull_request_target - -jobs: - triage: - permissions: - contents: read - pull-requests: write - runs-on: ubuntu-latest - # Do not auto-label nightly builds PR - if: ${{ github.event.pull_request.number != 26921 }} - steps: - - uses: actions/labeler@v4 - with: - repo-token: "${{ secrets.GITHUB_TOKEN }}" - sync-labels: '' - -concurrency: - group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} - cancel-in-progress: true diff --git a/.github/workflows/lint-bc.yml b/.github/workflows/lint-bc.yml index f2d26a0aec9a4..73d7805082026 100644 --- a/.github/workflows/lint-bc.yml +++ b/.github/workflows/lint-bc.yml @@ -1,24 +1,13 @@ name: BC Lint on: - # Copied from check-labels.yml to get around needing approval for first time contributors - # See https://docs.github.com/en/actions/managing-workflow-runs/approving-workflow-runs-from-public-forks - # Only allow pull_request_target when merging to main, not some historical branch. - # - # Make sure to don't introduce explicit checking out and installing/running - # untrusted user code into this workflow! - pull_request_target: - types: [opened, synchronize, reopened, labeled, unlabeled] - branches: [main] - paths-ignore: [.github/workflows/lint-bc.yml] - - # To allow testing PRs that change workflows. - # May be triggered together with pull_request_target, it's OK. pull_request: - types: [opened, synchronize, reopened, labeled, unlabeled] - paths: [.github/workflows/lint-bc.yml] - branches-ignore: [nightly] - + types: + - opened + - synchronize + - reopened + branches-ignore: + - nightly workflow_dispatch: jobs: diff --git a/.github/workflows/lint.yml b/.github/workflows/lint.yml index 90845e82d67b3..f1b6611d00e03 100644 --- a/.github/workflows/lint.yml +++ b/.github/workflows/lint.yml @@ -11,56 +11,40 @@ on: - landchecks/* workflow_dispatch: +permissions: read-all # The names of steps that actually test the code should be suffixed with `(nonretryable)`. # When any other step fails, it's job will be retried once by retryBot. 
jobs: - lintrunner: + lintrunner-clang: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main with: timeout: 120 runner: linux.2xlarge docker-image: pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter + # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout + # to run git rev-parse HEAD~:.ci/docker when a new image is needed fetch-depth: 0 + submodules: true ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} script: | - # The generic Linux job chooses to use base env, not the one setup by the image - CONDA_ENV=$(conda env list --json | jq -r ".envs | .[-1]") - conda activate "${CONDA_ENV}" - - CACHE_DIRECTORY="/tmp/.lintbin" - # Try to recover the cached binaries - if [[ -d "${CACHE_DIRECTORY}" ]]; then - # It's ok to fail this as lintrunner init would download these binaries - # again if they do not exist - cp -r "${CACHE_DIRECTORY}" . || true - fi - - # This has already been cached in the docker image - lintrunner init 2> /dev/null - - # Do build steps necessary for linters - python3 -m tools.linter.clang_tidy.generate_build_files - python3 -m tools.generate_torch_version --is_debug=false - python3 -m tools.pyi.gen_pyi \ - --native-functions-path aten/src/ATen/native/native_functions.yaml \ - --tags-path aten/src/ATen/native/tags.yaml \ - --deprecated-functions-path "tools/autograd/deprecated.yaml" - - RC=0 - # Run lintrunner on all files - if ! lintrunner --force-color --all-files --tee-json=lint.json 2> /dev/null; then - echo "" - echo -e "\e[1m\e[36mYou can reproduce these results locally by using \`lintrunner\`. (If you don't get the same results, run \'lintrunner init\' to update your local linter)\e[0m" - echo -e "\e[1m\e[36mSee https://github.com/pytorch/pytorch/wiki/lintrunner for setup instructions.\e[0m" - RC=1 - fi + export ADDITIONAL_LINTRUNNER_ARGS="--take CLANGTIDY,CLANGFORMAT" + export CLANG=1 + .github/scripts/lintrunner.sh - # Use jq to massage the JSON lint output into GitHub Actions workflow commands. 
- jq --raw-output \ - '"::\(if .severity == "advice" or .severity == "disabled" then "warning" else .severity end) file=\(.path),line=\(.line),col=\(.char),title=\(.code) \(.name)::" + (.description | gsub("\\n"; "%0A"))' \ - lint.json || true - - exit $RC + lintrunner-noclang: + uses: pytorch/test-infra/.github/workflows/linux_job.yml@main + with: + timeout: 120 + runner: linux.2xlarge + docker-image: pytorch-linux-jammy-cuda11.8-cudnn8-py3.9-linter + # NB: A shallow checkout won't work here because calculate-docker-image requires a full checkout + # to run git rev-parse HEAD~:.ci/docker when a new image is needed + fetch-depth: 0 + submodules: true + ref: ${{ github.event_name == 'pull_request' && github.event.pull_request.head.sha || github.sha }} + script: | + export ADDITIONAL_LINTRUNNER_ARGS="--skip CLANGTIDY,CLANGFORMAT" + .github/scripts/lintrunner.sh quick-checks: uses: pytorch/test-infra/.github/workflows/linux_job.yml@main @@ -224,7 +208,7 @@ jobs: cache: pip - name: Install dependencies run: | - pip install pytest-rerunfailures==11.1.* pytest-shard==0.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.1.* numpy==1.24.* + pip install pytest-rerunfailures==11.1.* pytest-flakefinder==1.1.* pytest-xdist==3.3.* expecttest==0.1.* numpy==1.24.* pip install torch --pre --index-url https://download.pytorch.org/whl/nightly/cpu/ - name: Run run_test.py (nonretryable) run: | @@ -246,11 +230,11 @@ jobs: with: submodules: false fetch-depth: 1 - - name: Setup Python 3.5 + - name: Setup Python 3.6 if: matrix.test_type == 'older_python_version' uses: actions/setup-python@v4 with: - python-version: '3.5' + python-version: '3.6' architecture: x64 check-latest: false cache: pip diff --git a/.github/workflows/linux-aarch64.yml b/.github/workflows/linux-aarch64.yml new file mode 100644 index 0000000000000..acdb6884971b6 --- /dev/null +++ b/.github/workflows/linux-aarch64.yml @@ -0,0 +1,39 @@ +name: linux-aarch64 + +on: + push: + tags: + - ciflow/linux-aarch64/* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true + +jobs: + linux-jammy-aarch64-py3_10-build: + name: linux-jammy-aarch64-py3.10 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-jammy-aarch64-py3.10 + docker-image-name: pytorch-linux-jammy-aarch64-py3.10-gcc11 + runner: linux.arm64.2xlarge + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 4, runner: "linux.arm64.2xlarge" }, + { config: "default", shard: 2, num_shards: 4, runner: "linux.arm64.2xlarge" }, + { config: "default", shard: 3, num_shards: 4, runner: "linux.arm64.2xlarge" }, + { config: "default", shard: 4, num_shards: 4, runner: "linux.arm64.2xlarge" }, + ]} + + linux-jammy-aarch64-py3_10-test: + name: linux-jammy-aarch64-py3.10 + uses: ./.github/workflows/_linux-test.yml + needs: linux-jammy-aarch64-py3_10-build + permissions: + id-token: write + contents: read + with: + build-environment: linux-jammy-aarch64-py3.10 + docker-image: ${{ needs.linux-jammy-aarch64-py3_10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-aarch64-py3_10-build.outputs.test-matrix }} diff --git a/.github/workflows/llm_td_retrieval.yml 
b/.github/workflows/llm_td_retrieval.yml new file mode 100644 index 0000000000000..047e8ace0049d --- /dev/null +++ b/.github/workflows/llm_td_retrieval.yml @@ -0,0 +1,120 @@ +name: Retrieval PyTorch Tests for Target Determination + +on: + workflow_call: + +permissions: + id-token: write + contents: read + +jobs: + llm-retrieval: + runs-on: linux.4xlarge + continue-on-error: true + steps: + - name: Clone PyTorch + uses: actions/checkout@v3 + with: + repository: pytorch/pytorch + fetch-depth: 0 + path: pytorch + + - name: Setup Linux + uses: ./pytorch/.github/actions/setup-linux + + - name: Clone CodeLlama + uses: actions/checkout@v3 + with: + repository: osalpekar/codellama + ref: main + path: codellama + + - name: Clone Target Determination Code + uses: actions/checkout@v3 + with: + repository: osalpekar/llm-target-determinator + ref: v0.0.2 + path: llm-target-determinator + + - name: Setup Conda + uses: conda-incubator/setup-miniconda@v2.1.1 + with: + miniconda-version: "py39_4.12.0" + python-version: 3.9 + + - name: Install Requirements + shell: bash -l {0} + run: | + set -euxo pipefail + conda create \ + --yes \ + --quiet \ + --name "tdenv" \ + "python=3.9" + conda activate tdenv + cd "${GITHUB_WORKSPACE}/llm-target-determinator" + pip install -r requirements.txt + cd ../codellama + pip install -e . + + - name: Fetch CodeLlama Checkpoint + shell: bash -l {0} + run: | + set -euxo pipefail + conda activate tdenv + cd codellama/ + mkdir "CodeLlama-7b-Python" + aws s3 cp "s3://target-determinator-assets/CodeLlama-7b-Python" "CodeLlama-7b-Python" --recursive --no-progress + + - name: Fetch indexes + uses: nick-fields/retry@v2.8.2 + with: + max_attempts: 3 + retry_wait_seconds: 10 + timeout_minutes: 5 + shell: bash + command: | + set -euxo pipefail + python3 -m pip install awscli==1.29.40 + cd "${GITHUB_WORKSPACE}"/llm-target-determinator/assets + aws s3 cp "s3://target-determinator-assets/indexes/latest" . 
--recursive + + unzip -o indexer-files\*.zip + rm indexer-files*.zip + + - name: Run Retriever + id: run_retriever + continue-on-error: true # ghstack not currently supported due to problems getting git diff + shell: bash -l {0} + run: | + set -euxo pipefail + conda activate tdenv + cd "${GITHUB_WORKSPACE}"/llm-target-determinator + torchrun \ + --standalone \ + --nnodes=1 \ + --nproc-per-node=1 \ + retriever.py \ + --experiment-name indexer-files \ + --pr-parse-format GITDIFF + cd assets + zip -r mappings.zip mappings + + - name: Upload results to s3 + uses: seemethere/upload-artifact-s3@v5 + if: ${{ steps.run_retriever.outcome == 'success' }} + with: + name: llm_results + retention-days: 14 + if-no-files-found: warn + path: llm-target-determinator/assets/mappings.zip + env: + AWS_ACCESS_KEY_ID: "" + AWS_SECRET_ACCESS_KEY: "" + AWS_SESSION_TOKEN: "" + AWS_DEFAULT_REGION: "" + AWS_REGION: "" + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() diff --git a/.github/workflows/mac-mps.yml b/.github/workflows/mac-mps.yml index d2ec160e07f4b..f57ea0fdd07df 100644 --- a/.github/workflows/mac-mps.yml +++ b/.github/workflows/mac-mps.yml @@ -10,6 +10,8 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true +permissions: read-all + jobs: macos-12-py3-arm64-build: name: macos-12-py3-arm64 @@ -17,7 +19,7 @@ jobs: with: sync-tag: macos-12-py3-arm64-build build-environment: macos-12-py3-arm64 - runner-type: macos-m1-12 + runner-type: macos-m1-stable build-generates-artifacts: true # To match the one pre-installed in the m1 runners python-version: 3.9.12 @@ -27,7 +29,7 @@ jobs: environment-file: .github/requirements/conda-env-macOS-ARM64 test-matrix: | { include: [ - { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-12" }, + { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-stable" }, { config: "mps", shard: 1, num_shards: 1, runner: "macos-m2-14" }, ]} diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml index 76c38c032f573..25f71c70e9486 100644 --- a/.github/workflows/nightly.yml +++ b/.github/workflows/nightly.yml @@ -31,7 +31,7 @@ jobs: with: build-environment: linux-jammy-py3.8-gcc11 docker-image: ${{ needs.docs-build.outputs.docker-image }} - push: ${{ github.event_name == 'schedule' || startsWith(github.event.ref, 'refs/tags/v') }} + push: ${{ github.event_name == 'schedule' || github.event_name == 'workflow_dispatch' || startsWith(github.event.ref, 'refs/tags/v') }} run-doxygen: true secrets: GH_PYTORCHBOT_TOKEN: ${{ secrets.GH_PYTORCHBOT_TOKEN }} @@ -40,16 +40,14 @@ jobs: runs-on: ubuntu-latest environment: update-commit-hash steps: - - name: Checkout repo - uses: actions/checkout@v3 - with: - fetch-depth: 0 - name: update-vision-commit-hash - uses: ./.github/actions/update-commit-hash + uses: pytorch/test-infra/.github/actions/update-commit-hash@main if: ${{ github.event_name == 'schedule' }} with: repo-name: vision branch: main + pin-folder: .github/ci_commit_pins + test-infra-ref: main updatebot-token: ${{ secrets.UPDATEBOT_TOKEN }} pytorchbot-token: ${{ secrets.GH_PYTORCHBOT_TOKEN }} @@ -57,16 +55,14 @@ jobs: runs-on: ubuntu-latest environment: update-commit-hash steps: - - name: Checkout repo - uses: actions/checkout@v3 - with: - fetch-depth: 0 - name: update-audio-commit-hash - uses: ./.github/actions/update-commit-hash + uses: 
pytorch/test-infra/.github/actions/update-commit-hash@main if: ${{ github.event_name == 'schedule' }} with: repo-name: audio branch: main + pin-folder: .github/ci_commit_pins + test-infra-ref: main updatebot-token: ${{ secrets.UPDATEBOT_TOKEN }} pytorchbot-token: ${{ secrets.GH_PYTORCHBOT_TOKEN }} @@ -74,16 +70,13 @@ jobs: runs-on: ubuntu-latest environment: update-commit-hash steps: - - name: Checkout repo - uses: actions/checkout@v3 - with: - fetch-depth: 0 - name: update-executorch-commit-hash - uses: ./.github/actions/update-commit-hash + uses: pytorch/test-infra/.github/actions/update-commit-hash@main if: ${{ github.event_name == 'schedule' }} with: repo-name: executorch branch: main pin-folder: .ci/docker/ci_commit_pins + test-infra-ref: main updatebot-token: ${{ secrets.UPDATEBOT_TOKEN }} pytorchbot-token: ${{ secrets.GH_PYTORCHBOT_TOKEN }} diff --git a/.github/workflows/periodic.yml b/.github/workflows/periodic.yml index a6a8c6efffe97..716a72cc6d235 100644 --- a/.github/workflows/periodic.yml +++ b/.github/workflows/periodic.yml @@ -20,7 +20,24 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }} cancel-in-progress: true +permissions: read-all + jobs: + llm-td: + name: before-test + uses: ./.github/workflows/llm_td_retrieval.yml + permissions: + id-token: write + contents: read + + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + needs: llm-td + permissions: + id-token: write + contents: read + parallelnative-linux-jammy-py3_8-gcc11-build: name: parallelnative-linux-jammy-py3.8-gcc11 uses: ./.github/workflows/_linux-build.yml @@ -37,7 +54,9 @@ jobs: parallelnative-linux-jammy-py3_8-gcc11-test: name: parallelnative-linux-jammy-py3.8-gcc11 uses: ./.github/workflows/_linux-test.yml - needs: parallelnative-linux-jammy-py3_8-gcc11-build + needs: + - parallelnative-linux-jammy-py3_8-gcc11-build + - target-determination with: build-environment: parallelnative-linux-jammy-py3.8-gcc11 docker-image: ${{ needs.parallelnative-linux-jammy-py3_8-gcc11-build.outputs.docker-image }} @@ -84,7 +103,9 @@ jobs: linux-focal-cuda11_8-py3_10-gcc9-debug-test: name: linux-focal-cuda11.8-py3.10-gcc9-debug uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-cuda11_8-py3_10-gcc9-debug-build + needs: + - linux-focal-cuda11_8-py3_10-gcc9-debug-build + - target-determination with: build-environment: linux-focal-cuda11.8-py3.10-gcc9-debug docker-image: ${{ needs.linux-focal-cuda11_8-py3_10-gcc9-debug-build.outputs.docker-image }} @@ -108,7 +129,9 @@ jobs: win-vs2019-cuda11_8-py3-test: name: win-vs2019-cuda11.8-py3 uses: ./.github/workflows/_win-test.yml - needs: win-vs2019-cuda11_8-py3-build + needs: + - win-vs2019-cuda11_8-py3-build + - target-determination with: build-environment: win-vs2019-cuda11.8-py3 cuda-version: "11.8" @@ -194,11 +217,11 @@ jobs: docker-image: ${{ needs.linux-vulkan-focal-py3_11-clang10-build.outputs.docker-image }} test-matrix: ${{ needs.linux-vulkan-focal-py3_11-clang10-build.outputs.test-matrix }} - linux-focal-rocm5_7-py3_8-build: - name: linux-focal-rocm5.7-py3.8 + linux-focal-rocm6_1-py3_8-build: + name: linux-focal-rocm6.1-py3.8 uses: ./.github/workflows/_linux-build.yml with: - build-environment: linux-focal-rocm5.7-py3.8 + build-environment: linux-focal-rocm6.1-py3.8 docker-image-name: 
pytorch-linux-focal-rocm-n-py3 test-matrix: | { include: [ @@ -206,11 +229,16 @@ jobs: { config: "distributed", shard: 2, num_shards: 2, runner: "linux.rocm.gpu" }, ]} - linux-focal-rocm5_7-py3_8-test: - name: linux-focal-rocm5.7-py3.8 + linux-focal-rocm6_1-py3_8-test: + permissions: + id-token: write + contents: read + name: linux-focal-rocm6.1-py3.8 uses: ./.github/workflows/_rocm-test.yml - needs: linux-focal-rocm5_7-py3_8-build + needs: + - linux-focal-rocm6_1-py3_8-build + - target-determination with: - build-environment: linux-focal-rocm5.7-py3.8 - docker-image: ${{ needs.linux-focal-rocm5_7-py3_8-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-focal-rocm5_7-py3_8-build.outputs.test-matrix }} + build-environment: linux-focal-rocm6.1-py3.8 + docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }} diff --git a/.github/workflows/pull.yml b/.github/workflows/pull.yml index 4c4b33a38868f..0ca9e0d33c8f9 100644 --- a/.github/workflows/pull.yml +++ b/.github/workflows/pull.yml @@ -17,10 +17,27 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +permissions: read-all + jobs: + llm-td: + name: before-test + uses: ./.github/workflows/llm_td_retrieval.yml + permissions: + id-token: write + contents: read + + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + needs: llm-td + permissions: + id-token: write + contents: read + linux-jammy-py3_8-gcc11-build: name: linux-jammy-py3.8-gcc11 - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-jammy-py3.8-gcc11 docker-image-name: pytorch-linux-jammy-py3.8-gcc11 @@ -39,7 +56,9 @@ jobs: linux-jammy-py3_8-gcc11-test: name: linux-jammy-py3.8-gcc11 uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-py3_8-gcc11-build + needs: + - linux-jammy-py3_8-gcc11-build + - target-determination with: build-environment: linux-jammy-py3.8-gcc11 docker-image: ${{ needs.linux-jammy-py3_8-gcc11-build.outputs.docker-image }} @@ -55,7 +74,7 @@ jobs: linux-jammy-py3_8-gcc11-no-ops: name: linux-jammy-py3.8-gcc11-no-ops - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-jammy-py3.8-gcc11-no-ops docker-image-name: pytorch-linux-jammy-py3.8-gcc11 @@ -66,7 +85,7 @@ jobs: linux-jammy-py3_8-gcc11-pch: name: linux-jammy-py3.8-gcc11-pch - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-jammy-py3.8-gcc11-pch docker-image-name: pytorch-linux-jammy-py3.8-gcc11 @@ -75,9 +94,10 @@ jobs: { config: "default", shard: 1, num_shards: 1 }, ]} + linux-jammy-py3_10-clang15-asan-build: name: linux-jammy-py3.10-clang15-asan - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-jammy-py3.10-clang15-asan docker-image-name: pytorch-linux-jammy-py3-clang15-asan @@ -92,10 +112,13 @@ jobs: ]} sync-tag: asan-build + linux-jammy-py3_10-clang15-asan-test: name: linux-jammy-py3.10-clang15-asan uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-py3_10-clang15-asan-build + needs: + - linux-jammy-py3_10-clang15-asan-build + - target-determination with: build-environment: 
linux-jammy-py3.10-clang15-asan docker-image: ${{ needs.linux-jammy-py3_10-clang15-asan-build.outputs.docker-image }} @@ -104,7 +127,7 @@ jobs: linux-focal-py3_8-clang10-onnx-build: name: linux-focal-py3.8-clang10-onnx - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-focal-py3.8-clang10-onnx docker-image-name: pytorch-linux-focal-py3-clang10-onnx @@ -117,7 +140,9 @@ jobs: linux-focal-py3_8-clang10-onnx-test: name: linux-focal-py3.8-clang10-onnx uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-py3_8-clang10-onnx-build + needs: + - linux-focal-py3_8-clang10-onnx-build + - target-determination with: build-environment: linux-focal-py3.8-clang10-onnx docker-image: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.docker-image }} @@ -125,7 +150,7 @@ jobs: linux-focal-py3_8-clang10-build: name: linux-focal-py3.8-clang10 - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-focal-py3.8-clang10 docker-image-name: pytorch-linux-focal-py3.8-clang10 @@ -136,19 +161,16 @@ jobs: { config: "default", shard: 3, num_shards: 3, runner: "linux.2xlarge" }, { config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, { config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 1, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 2, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 3, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 4, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 5, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 6, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 7, num_shards: 7, runner: "linux.2xlarge" }, + { config: "dynamo", shard: 1, num_shards: 3, runner: "linux.2xlarge" }, + { config: "dynamo", shard: 2, num_shards: 3, runner: "linux.2xlarge" }, + { config: "dynamo", shard: 3, num_shards: 3, runner: "linux.2xlarge" }, ]} - linux-focal-py3_8-clang10-test: name: linux-focal-py3.8-clang10 uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-py3_8-clang10-build + needs: + - linux-focal-py3_8-clang10-build + - target-determination with: build-environment: linux-focal-py3.8-clang10 docker-image: ${{ needs.linux-focal-py3_8-clang10-build.outputs.docker-image }} @@ -156,7 +178,7 @@ jobs: linux-focal-py3_11-clang10-build: name: linux-focal-py3.11-clang10 - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-focal-py3.11-clang10 docker-image-name: pytorch-linux-focal-py3.11-clang10 @@ -167,27 +189,52 @@ jobs: { config: "default", shard: 3, num_shards: 3, runner: "linux.2xlarge" }, { config: "crossref", shard: 1, num_shards: 2, runner: "linux.2xlarge" }, { config: "crossref", shard: 2, num_shards: 2, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 1, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 2, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 3, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 4, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 5, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 6, num_shards: 7, runner: "linux.2xlarge" }, - { config: "dynamo", shard: 7, num_shards: 7, runner: "linux.2xlarge" }, + { config: "dynamo", shard: 
1, num_shards: 3, runner: "linux.2xlarge" }, + { config: "dynamo", shard: 2, num_shards: 3, runner: "linux.2xlarge" }, + { config: "dynamo", shard: 3, num_shards: 3, runner: "linux.2xlarge" }, ]} + linux-focal-py3_11-clang10-test: name: linux-focal-py3.11-clang10 uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-py3_11-clang10-build + needs: + - linux-focal-py3_11-clang10-build + - target-determination with: build-environment: linux-focal-py3.11-clang10 docker-image: ${{ needs.linux-focal-py3_11-clang10-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-py3_11-clang10-build.outputs.test-matrix }} + linux-focal-py3_12-clang10-build: + name: linux-focal-py3.12-clang10 + uses: ./.github/workflows/_linux-build-label.yml + with: + build-environment: linux-focal-py3.12-clang10 + docker-image-name: pytorch-linux-focal-py3.12-clang10 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 3, runner: "linux.2xlarge" }, + { config: "default", shard: 2, num_shards: 3, runner: "linux.2xlarge" }, + { config: "default", shard: 3, num_shards: 3, runner: "linux.2xlarge" }, + { config: "dynamo", shard: 1, num_shards: 3, runner: "linux.2xlarge" }, + { config: "dynamo", shard: 2, num_shards: 3, runner: "linux.2xlarge" }, + { config: "dynamo", shard: 3, num_shards: 3, runner: "linux.2xlarge" }, + ]} + + linux-focal-py3_12-clang10-test: + name: linux-focal-py3.12-clang10 + uses: ./.github/workflows/_linux-test.yml + needs: linux-focal-py3_12-clang10-build + with: + build-environment: linux-focal-py3.12-clang10 + docker-image: ${{ needs.linux-focal-py3_12-clang10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-py3_12-clang10-build.outputs.test-matrix }} + timeout-minutes: 600 + linux-focal-cuda11_8-py3_10-gcc9-build: name: linux-focal-cuda11.8-py3.10-gcc9 - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-focal-cuda11.8-py3.10-gcc9 docker-image-name: pytorch-linux-focal-cuda11.8-cudnn8-py3-gcc9 @@ -201,7 +248,9 @@ jobs: linux-focal-cuda11_8-py3_10-gcc9-test: name: linux-focal-cuda11.8-py3.10-gcc9 uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-cuda11_8-py3_10-gcc9-build + needs: + - linux-focal-cuda11_8-py3_10-gcc9-build + - target-determination with: timeout-minutes: 360 build-environment: linux-focal-cuda11.8-py3.10-gcc9 @@ -210,7 +259,7 @@ jobs: linux-focal-cuda12_1-py3_10-gcc9-build: name: linux-focal-cuda12.1-py3.10-gcc9 - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-focal-cuda12.1-py3.10-gcc9 docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9 @@ -227,7 +276,9 @@ jobs: linux-focal-cuda12_1-py3_10-gcc9-test: name: linux-focal-cuda12.1-py3.10-gcc9 uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-cuda12_1-py3_10-gcc9-build + needs: + - linux-focal-cuda12_1-py3_10-gcc9-build + - target-determination with: timeout-minutes: 360 build-environment: linux-focal-cuda12.1-py3.10-gcc9 @@ -236,10 +287,10 @@ jobs: linux-jammy-py3-clang12-mobile-build: name: linux-jammy-py3-clang12-mobile-build - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-jammy-py3-clang12-mobile-build - docker-image-name: pytorch-linux-jammy-py3-clang12-asan + docker-image-name: pytorch-linux-jammy-py3-clang15-asan build-generates-artifacts: false test-matrix: | { include: [ @@ -248,7 +299,7 @@ jobs: 
linux-jammy-cuda-11_8-cudnn8-py3_8-clang12-build: name: linux-jammy-cuda11.8-cudnn8-py3.8-clang12 - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-jammy-cuda11.8-cudnn8-py3.8-clang12 docker-image-name: pytorch-linux-jammy-cuda11.8-cudnn8-py3.8-clang12 @@ -259,7 +310,7 @@ jobs: linux-focal-py3-clang9-mobile-custom-build-static: name: linux-focal-py3-clang9-mobile-custom-build-static - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-focal-py3-clang9-mobile-custom-build-static docker-image-name: pytorch-linux-focal-py3-clang9-android-ndk-r21e @@ -271,9 +322,9 @@ jobs: linux-focal-py3_8-clang9-xla-build: name: linux-focal-py3_8-clang9-xla - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: - build-environment: linux-focal-py3_8-clang9-xla + build-environment: linux-focal-py3.8-clang9-xla docker-image-name: 308535385114.dkr.ecr.us-east-1.amazonaws.com/pytorch/xla_base:v1.1-lite test-matrix: | { include: [ @@ -285,7 +336,7 @@ jobs: uses: ./.github/workflows/_linux-test.yml needs: linux-focal-py3_8-clang9-xla-build with: - build-environment: linux-focal-py3_8-clang9-xla + build-environment: linux-focal-py3.8-clang9-xla docker-image: ${{ needs.linux-focal-py3_8-clang9-xla-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-py3_8-clang9-xla-build.outputs.test-matrix }} @@ -353,7 +404,7 @@ jobs: linux-jammy-py3_8-gcc11-mobile-lightweight-dispatch-build: name: linux-jammy-py3.8-gcc11-mobile-lightweight-dispatch-build - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-jammy-py3.8-gcc111-mobile-lightweight-dispatch-build docker-image-name: pytorch-linux-jammy-py3.8-gcc11 @@ -363,13 +414,13 @@ jobs: { config: "default", shard: 1, num_shards: 1 }, ]} - linux-focal-rocm5_7-py3_8-build: + linux-focal-rocm6_1-py3_8-build: # don't run build twice on main if: github.event_name == 'pull_request' - name: linux-focal-rocm5.7-py3.8 - uses: ./.github/workflows/_linux-build.yml + name: linux-focal-rocm6.1-py3.8 + uses: ./.github/workflows/_linux-build-label.yml with: - build-environment: linux-focal-rocm5.7-py3.8 + build-environment: linux-focal-rocm6.1-py3.8 docker-image-name: pytorch-linux-focal-rocm-n-py3 sync-tag: rocm-build test-matrix: | @@ -381,7 +432,7 @@ jobs: linux-focal-cuda12_1-py3_10-gcc9-sm86-build: name: linux-focal-cuda12.1-py3.10-gcc9-sm86 - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86 docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9 @@ -398,7 +449,9 @@ jobs: linux-focal-cuda12_1-py3_10-gcc9-sm86-test: name: linux-focal-cuda12.1-py3.10-gcc9-sm86 uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-cuda12_1-py3_10-gcc9-sm86-build + needs: + - linux-focal-cuda12_1-py3_10-gcc9-sm86-build + - target-determination with: build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86 docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-sm86-build.outputs.docker-image }} @@ -406,7 +459,7 @@ jobs: linux-jammy-py3-clang12-executorch-build: name: linux-jammy-py3-clang12-executorch - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-jammy-py3-clang12-executorch docker-image-name: 
pytorch-linux-jammy-py3-clang12-executorch diff --git a/.github/workflows/rocm.yml b/.github/workflows/rocm.yml index 856d13a33a4c3..c32abe592bef2 100644 --- a/.github/workflows/rocm.yml +++ b/.github/workflows/rocm.yml @@ -15,12 +15,21 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +permissions: read-all + jobs: - linux-focal-rocm5_7-py3_8-build: - name: linux-focal-rocm5.7-py3.8 - uses: ./.github/workflows/_linux-build.yml + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + permissions: + id-token: write + contents: read + + linux-focal-rocm6_1-py3_8-build: + name: linux-focal-rocm6.1-py3.8 + uses: ./.github/workflows/_linux-build-label.yml with: - build-environment: linux-focal-rocm5.7-py3.8 + build-environment: linux-focal-rocm6.1-py3.8 docker-image-name: pytorch-linux-focal-rocm-n-py3 sync-tag: rocm-build test-matrix: | @@ -33,11 +42,16 @@ jobs: { config: "default", shard: 6, num_shards: 6, runner: "linux.rocm.gpu.2" }, ]} - linux-focal-rocm5_7-py3_8-test: - name: linux-focal-rocm5.7-py3.8 + linux-focal-rocm6_1-py3_8-test: + permissions: + id-token: write + contents: read + name: linux-focal-rocm6.1-py3.8 uses: ./.github/workflows/_rocm-test.yml - needs: linux-focal-rocm5_7-py3_8-build + needs: + - linux-focal-rocm6_1-py3_8-build + - target-determination with: - build-environment: linux-focal-rocm5.7-py3.8 - docker-image: ${{ needs.linux-focal-rocm5_7-py3_8-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-focal-rocm5_7-py3_8-build.outputs.test-matrix }} + build-environment: linux-focal-rocm6.1-py3.8 + docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }} diff --git a/.github/workflows/slow.yml b/.github/workflows/slow.yml index a9b0c654b4cbc..31db7af8fc550 100644 --- a/.github/workflows/slow.yml +++ b/.github/workflows/slow.yml @@ -18,7 +18,24 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }} cancel-in-progress: true +permissions: read-all + jobs: + llm-td: + name: before-test + uses: ./.github/workflows/llm_td_retrieval.yml + permissions: + id-token: write + contents: read + + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + needs: llm-td + permissions: + id-token: write + contents: read + linux-focal-cuda12_1-py3-gcc9-slow-gradcheck-build: name: linux-focal-cuda12.1-py3-gcc9-slow-gradcheck uses: ./.github/workflows/_linux-build.yml @@ -28,16 +45,20 @@ jobs: cuda-arch-list: 8.6 test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 2, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 3, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" }, - { config: "default", shard: 4, num_shards: 4, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "default", shard: 1, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: 
"default", shard: 3, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.g5.4xlarge.nvidia.gpu" }, ]} linux-focal-cuda12_1-py3-gcc9-slow-gradcheck-test: name: linux-focal-cuda12.1-py3-gcc9-slow-gradcheck uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-cuda12_1-py3-gcc9-slow-gradcheck-build + needs: + - linux-focal-cuda12_1-py3-gcc9-slow-gradcheck-build + - target-determination with: build-environment: linux-focal-cuda12.1-py3-gcc9-slow-gradcheck docker-image: ${{ needs.linux-focal-cuda12_1-py3-gcc9-slow-gradcheck-build.outputs.docker-image }} @@ -60,7 +81,9 @@ jobs: linux-focal-cuda12_1-py3_10-gcc9-sm86-test: name: linux-focal-cuda12.1-py3.10-gcc9-sm86 uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-cuda12_1-py3_10-gcc9-sm86-build + needs: + - linux-focal-cuda12_1-py3_10-gcc9-sm86-build + - target-determination with: build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm86 docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-sm86-build.outputs.docker-image }} @@ -80,49 +103,59 @@ jobs: linux-focal-py3_8-clang10-test: name: linux-focal-py3.8-clang10 uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-py3_8-clang10-build + needs: + - linux-focal-py3_8-clang10-build + - target-determination with: build-environment: linux-focal-py3.8-clang10 docker-image: ${{ needs.linux-focal-py3_8-clang10-build.outputs.docker-image }} test-matrix: ${{ needs.linux-focal-py3_8-clang10-build.outputs.test-matrix }} - linux-focal-rocm5_6-py3_8-build: - name: linux-focal-rocm5.6-py3.8 + linux-focal-rocm6_1-py3_8-build: + name: linux-focal-rocm6.1-py3.8 uses: ./.github/workflows/_linux-build.yml with: - build-environment: linux-focal-rocm5.6-py3.8 + build-environment: linux-focal-rocm6.1-py3.8 docker-image-name: pytorch-linux-focal-rocm-n-py3 test-matrix: | { include: [ { config: "slow", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" }, ]} - linux-focal-rocm5_6-py3_8-test: - name: linux-focal-rocm5.6-py3.8 + linux-focal-rocm6_1-py3_8-test: + permissions: + id-token: write + contents: read + name: linux-focal-rocm6.1-py3.8 uses: ./.github/workflows/_rocm-test.yml - needs: linux-focal-rocm5_6-py3_8-build + needs: + - linux-focal-rocm6_1-py3_8-build + - target-determination with: - build-environment: linux-focal-rocm5.6-py3.8 - docker-image: ${{ needs.linux-focal-rocm5_6-py3_8-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-focal-rocm5_6-py3_8-build.outputs.test-matrix }} + build-environment: linux-focal-rocm6.1-py3.8 + docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }} linux-jammy-py3_10-clang15-asan-build: name: linux-jammy-py3.10-clang15-asan - uses: ./.github/workflows/_linux-build.yml + uses: ./.github/workflows/_linux-build-label.yml with: build-environment: linux-jammy-py3.10-clang15-asan docker-image-name: pytorch-linux-jammy-py3-clang15-asan test-matrix: | { include: [ - { config: "slow", shard: 1, num_shards: 2, runner: "linux.4xlarge" }, - { config: "slow", shard: 2, num_shards: 2, runner: "linux.4xlarge" }, + { config: "slow", shard: 1, num_shards: 3, runner: "linux.4xlarge" }, + { config: "slow", shard: 2, num_shards: 3, runner: "linux.4xlarge" }, + { config: "slow", shard: 3, num_shards: 3, 
runner: "linux.4xlarge" }, ]} sync-tag: asan-build linux-jammy-py3_10-clang15-asan-test: name: linux-jammy-py3.10-clang15-asan uses: ./.github/workflows/_linux-test.yml - needs: linux-jammy-py3_10-clang15-asan-build + needs: + - linux-jammy-py3_10-clang15-asan-build + - target-determination with: build-environment: linux-jammy-py3.10-clang15-asan docker-image: ${{ needs.linux-jammy-py3_10-clang15-asan-build.outputs.docker-image }} diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml index 39fe67da05d5a..56e349dfa1b82 100644 --- a/.github/workflows/stale.yml +++ b/.github/workflows/stale.yml @@ -22,6 +22,9 @@ jobs: stale: if: ${{ github.repository == 'pytorch/pytorch' }} runs-on: linux.large.arc + permissions: + contents: read + pull-requests: write steps: - uses: actions/github-script@v6 diff --git a/.github/workflows/target-determination-indexer.yml b/.github/workflows/target-determination-indexer.yml new file mode 100644 index 0000000000000..0ce1bae6a4138 --- /dev/null +++ b/.github/workflows/target-determination-indexer.yml @@ -0,0 +1,144 @@ +name: Index PyTorch Tests for Target Determination + +on: + workflow_dispatch: + schedule: + - cron: '0 0 * * *' + +permissions: + id-token: write + contents: read + +jobs: + index: + runs-on: linux.g5.4xlarge.nvidia.gpu # 1 GPU A10G 24GB each + environment: target-determinator-env + steps: + - name: Clone PyTorch + uses: actions/checkout@v3 + with: + path: pytorch + + - name: Setup Linux + uses: ./pytorch/.github/actions/setup-linux + + - name: Calculate docker image + id: calculate-docker-image + uses: pytorch/test-infra/.github/actions/calculate-docker-image@main + with: + docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9 + working-directory: pytorch + + - name: Use following to pull public copy of the image + id: print-ghcr-mirror + env: + ECR_DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + shell: bash + run: | + tag=${ECR_DOCKER_IMAGE##*/} + echo "docker pull ghcr.io/pytorch/ci-image:${tag/:/-}" + + - name: Pull docker image + uses: pytorch/test-infra/.github/actions/pull-docker-image@main + with: + docker-image: ${{ steps.calculate-docker-image.outputs.docker-image }} + + - name: Install nvidia driver, nvidia-docker runtime, set GPU_FLAG + id: install-nvidia-driver + uses: pytorch/test-infra/.github/actions/setup-nvidia@main + + - name: Clone CodeLlama + uses: actions/checkout@v3 + with: + repository: osalpekar/codellama + ref: 1ec50e0cfc0fadc3b6ceb146617e2119ab26eb34 + path: codellama + + - name: Clone Target Determination Code + uses: actions/checkout@v3 + with: + repository: osalpekar/llm-target-determinator + ref: v0.0.2 + path: llm-target-determinator + + - name: Configure AWS credentials + uses: aws-actions/configure-aws-credentials@v3 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_target_determinator_s3_read_write + aws-region: us-east-1 + + - name: Download checkpoint + shell: bash + env: + AWS_DEFAULT_REGION: us-east-1 + run: | + # Do this outside of docker so I don't have to put env vars in + pip3 install awscli==1.29.40 + cd codellama + mkdir "CodeLlama-7b-Python" + aws s3 cp \ + "s3://target-determinator-assets/CodeLlama-7b-Python" \ + "CodeLlama-7b-Python" \ + --recursive + + - name: Run indexer + shell: bash -l {0} + env: + DOCKER_IMAGE: ${{ steps.calculate-docker-image.outputs.docker-image }} + GITHUB_RUN_ID: ${{ github.run_id }} + AWS_DEFAULT_REGION: us-east-1 + run: | + # detached container should get cleaned up by teardown_ec2_linux + 
container_name=$(docker run \ + ${GPU_FLAG:-} \ + -e MAX_JOBS="$(nproc --ignore=2)" \ + -e AWS_DEFAULT_REGION \ + --env-file="/tmp/github_env_${GITHUB_RUN_ID}" \ + --security-opt seccomp=unconfined \ + --cap-add=SYS_PTRACE \ + --tty \ + --detach \ + --user jenkins \ + -v "${GITHUB_WORKSPACE}:/var/lib/jenkins/workspace" \ + -w /var/lib/jenkins/workspace \ + "${DOCKER_IMAGE}" + ) + chmod +x pytorch/.github/scripts/td_llm_indexer.sh + docker exec -t "${container_name}" sh -c 'pytorch/.github/scripts/td_llm_indexer.sh' + + - name: Upload to s3 + shell: bash -l {0} + env: + AWS_DEFAULT_REGION: us-east-1 + run: | + cd llm-target-determinator/assets + + TIMESTAMP=$(date -Iseconds) + ZIP_NAME="indexer-files-${TIMESTAMP}.zip" + + # Create a zipfile with all the generated indices + zip -r "${ZIP_NAME}" indexer-files + + # Note that because the below 2 operations are not atomic, there will + # be a period of a few seconds between these where there is no index + # present in the latest/ folder. To account for this, the retriever + # should have some retry logic with backoff to ensure fetching the + # index doesn't fail. + # Move the old index into the archived/ folder + aws s3 mv \ + "s3://target-determinator-assets/indexes/latest" \ + "s3://target-determinator-assets/indexes/archived" \ + --recursive + + # Move the new index into the latest/ folder + aws s3 cp \ + "${ZIP_NAME}" \ + "s3://target-determinator-assets/indexes/latest/${ZIP_NAME}" + + - name: Teardown Linux + uses: pytorch/test-infra/.github/actions/teardown-linux@main + if: always() + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.sha }}-${{ github.event_name == 'workflow_dispatch' }} + cancel-in-progress: true diff --git a/.github/workflows/target_determination.yml b/.github/workflows/target_determination.yml new file mode 100644 index 0000000000000..cd5e758345b59 --- /dev/null +++ b/.github/workflows/target_determination.yml @@ -0,0 +1,81 @@ +name: target-determination + +on: + workflow_call: + +jobs: + target-determination: + # Don't run on forked repos + if: github.repository_owner == 'pytorch' + runs-on: linux.2xlarge + steps: + # [pytorch repo ref] + # Use a pytorch/pytorch reference instead of a reference to the local + # checkout because when we run this action we don't *have* a local + # checkout. In other cases you should prefer a local checkout. 
+ - name: Checkout PyTorch + uses: pytorch/pytorch/.github/actions/checkout-pytorch@main + with: + submodules: false + + - name: Setup Linux + uses: ./.github/actions/setup-linux + + - name: Get workflow job id + id: get-job-id + uses: ./.github/actions/get-workflow-job-id + if: always() + with: + github-token: ${{ secrets.GITHUB_TOKEN }} + + - name: Download pytest cache + uses: ./.github/actions/pytest-cache-download + continue-on-error: true + with: + cache_dir: .pytest_cache + job_identifier: ${{ github.workflow }} + + - name: Download LLM Artifacts from S3 + uses: seemethere/download-artifact-s3@v4 + continue-on-error: true + with: + name: llm_results + path: .additional_ci_files/llm_results + + - name: Do TD + id: td + continue-on-error: true + env: + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_REPOSITORY: ${{ github.repository }} + GITHUB_WORKFLOW: ${{ github.workflow }} + GITHUB_JOB: ${{ github.job }} + GITHUB_RUN_ID: ${{ github.run_id }} + GITHUB_RUN_NUMBER: ${{ github.run_number }} + GITHUB_RUN_ATTEMPT: ${{ github.run_attempt }} + GITHUB_REF: ${{ github.ref }} + JOB_ID: ${{ steps.get-job-id.outputs.job-id }} + JOB_NAME: ${{ steps.get-job-id.outputs.job-name }} + PR_NUMBER: ${{ github.event.pull_request.number }} + run: | + unzip -o .additional_ci_files/llm_results/mappings.zip -d .additional_ci_files/llm_results || true + python3 -m pip install boto3==1.19.12 + python3 tools/testing/do_target_determination_for_s3.py + + - name: Upload TD results to s3 + uses: seemethere/upload-artifact-s3@v5 + if: steps.td.outcome == 'success' + with: + name: td_results + retention-days: 14 + if-no-files-found: error + path: td_results.json + + - name: Store TD results on GHA + uses: actions/upload-artifact@v3 + if: steps.td.outcome == 'success' + with: + name: td_results.json + retention-days: 14 + if-no-files-found: error + path: td_results.json diff --git a/.github/workflows/torchbench.yml b/.github/workflows/torchbench.yml new file mode 100644 index 0000000000000..73befe34c0782 --- /dev/null +++ b/.github/workflows/torchbench.yml @@ -0,0 +1,38 @@ +name: torchbench + +on: + push: + tags: + - ciflow/torchbench/* + workflow_dispatch: + +concurrency: + group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} + cancel-in-progress: true + +jobs: + linux-focal-cuda12_1-py3_10-gcc9-torchbench-build-gcp: + name: cuda12.1-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-build.yml + with: + build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80 + docker-image-name: pytorch-linux-focal-cuda12.1-cudnn8-py3-gcc9-inductor-benchmarks + cuda-arch-list: '8.0' + test-matrix: | + { include: [ + { config: "torchbench_gcp_smoketest", shard: 1, num_shards: 1, runner: "linux.gcp.a100" }, + ]} + secrets: + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN }} + + linux-focal-cuda12_1-py3_10-gcc9-torchbench-test-gcp: + name: cuda12.1-py3.10-gcc9-sm80 + uses: ./.github/workflows/_linux-test.yml + needs: linux-focal-cuda12_1-py3_10-gcc9-torchbench-build-gcp + with: + build-environment: linux-focal-cuda12.1-py3.10-gcc9-sm80 + docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-torchbench-build-gcp.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-torchbench-build-gcp.outputs.test-matrix }} + use-gha: anything-non-empty-to-use-gha + secrets: + HUGGING_FACE_HUB_TOKEN: ${{ secrets.HUGGING_FACE_HUB_TOKEN 
}} diff --git a/.github/workflows/trunk.yml b/.github/workflows/trunk.yml index 5ded7ac152cf1..a990ad6941db1 100644 --- a/.github/workflows/trunk.yml +++ b/.github/workflows/trunk.yml @@ -16,7 +16,24 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }} cancel-in-progress: true +permissions: read-all + jobs: + llm-td: + name: before-test + uses: ./.github/workflows/llm_td_retrieval.yml + permissions: + id-token: write + contents: read + + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + needs: llm-td + permissions: + id-token: write + contents: read + # Build PyTorch with BUILD_CAFFE2=ON caffe2-linux-jammy-py3_8-gcc11-build: name: caffe2-linux-jammy-py3.8-gcc11 @@ -45,7 +62,9 @@ jobs: linux-focal-cuda12_1-py3_10-gcc9-test: name: linux-focal-cuda12.1-py3.10-gcc9 uses: ./.github/workflows/_linux-test.yml - needs: linux-focal-cuda12_1-py3_10-gcc9-build + needs: + - linux-focal-cuda12_1-py3_10-gcc9-build + - target-determination with: build-environment: linux-focal-cuda12.1-py3.10-gcc9 docker-image: ${{ needs.linux-focal-cuda12_1-py3_10-gcc9-build.outputs.docker-image }} @@ -93,7 +112,7 @@ jobs: with: sync-tag: macos-12-py3-arm64-build build-environment: macos-12-py3-arm64 - runner-type: macos-m1-12 + runner-type: macos-m1-stable build-generates-artifacts: true # To match the one pre-installed in the m1 runners python-version: 3.9.12 @@ -103,9 +122,9 @@ jobs: environment-file: .github/requirements/conda-env-macOS-ARM64 test-matrix: | { include: [ - { config: "default", shard: 1, num_shards: 3, runner: "macos-m1-12" }, - { config: "default", shard: 2, num_shards: 3, runner: "macos-m1-12" }, - { config: "default", shard: 3, num_shards: 3, runner: "macos-m1-12" }, + { config: "default", shard: 1, num_shards: 3, runner: "macos-m1-stable" }, + { config: "default", shard: 2, num_shards: 3, runner: "macos-m1-stable" }, + { config: "default", shard: 3, num_shards: 3, runner: "macos-m1-stable" }, ]} macos-12-py3-arm64-mps-test: @@ -120,13 +139,17 @@ jobs: python-version: 3.9.12 test-matrix: | { include: [ - { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-12" }, + { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-stable" }, + { config: "mps", shard: 1, num_shards: 1, runner: "macos-m1-14" }, + ]} macos-12-py3-arm64-test: name: macos-12-py3-arm64 uses: ./.github/workflows/_mac-test.yml - needs: macos-12-py3-arm64-build + needs: + - macos-12-py3-arm64-build + - target-determination with: build-environment: macos-12-py3-arm64 # Same as the build job @@ -151,7 +174,9 @@ jobs: win-vs2019-cpu-py3-test: name: win-vs2019-cpu-py3 uses: ./.github/workflows/_win-test.yml - needs: win-vs2019-cpu-py3-build + needs: + - win-vs2019-cpu-py3-build + - target-determination with: build-environment: win-vs2019-cpu-py3 cuda-version: cpu @@ -175,11 +200,11 @@ jobs: { config: "force_on_cpu", shard: 1, num_shards: 1, runner: "windows.4xlarge.nonephemeral" }, ]} - linux-focal-rocm5_7-py3_8-build: - name: linux-focal-rocm5.7-py3.8 - uses: ./.github/workflows/_linux-build.yml + linux-focal-rocm6_1-py3_8-build: + name: linux-focal-rocm6.1-py3.8 + uses: ./.github/workflows/_linux-build-label.yml with: - build-environment: linux-focal-rocm5.7-py3.8 + build-environment: linux-focal-rocm6.1-py3.8 docker-image-name: pytorch-linux-focal-rocm-n-py3 sync-tag: rocm-build test-matrix: | 
@@ -187,12 +212,17 @@ jobs: { config: "default", shard: 1, num_shards: 1, runner: "linux.rocm.gpu" }, ]} - linux-focal-rocm5_7-py3_8-test: - name: linux-focal-rocm5.7-py3.8 + linux-focal-rocm6_1-py3_8-test: + permissions: + id-token: write + contents: read + name: linux-focal-rocm6.1-py3.8 uses: ./.github/workflows/_rocm-test.yml - needs: linux-focal-rocm5_7-py3_8-build + needs: + - linux-focal-rocm6_1-py3_8-build + - target-determination with: - build-environment: linux-focal-rocm5.7-py3.8 - docker-image: ${{ needs.linux-focal-rocm5_7-py3_8-build.outputs.docker-image }} - test-matrix: ${{ needs.linux-focal-rocm5_7-py3_8-build.outputs.test-matrix }} + build-environment: linux-focal-rocm6.1-py3.8 + docker-image: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-rocm6_1-py3_8-build.outputs.test-matrix }} tests-to-include: "test_nn test_torch test_cuda test_ops test_unary_ufuncs test_binary_ufuncs test_autograd inductor/test_torchinductor" diff --git a/.github/workflows/unstable-periodic.yml b/.github/workflows/unstable-periodic.yml index df422752f7e3e..9a41bbd44f268 100644 --- a/.github/workflows/unstable-periodic.yml +++ b/.github/workflows/unstable-periodic.yml @@ -13,6 +13,8 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }}-${{ github.event_name == 'schedule' }}-${{ github.event.schedule }} cancel-in-progress: true +permissions: read-all + jobs: # There must be at least one job here to satisfy GitHub action workflow syntax introduction: diff --git a/.github/workflows/unstable.yml b/.github/workflows/unstable.yml index 7a803b54ef8bb..ac1d49d1cce57 100644 --- a/.github/workflows/unstable.yml +++ b/.github/workflows/unstable.yml @@ -12,6 +12,8 @@ concurrency: group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref_name }}-${{ github.ref_type == 'branch' && github.sha }}-${{ github.event_name == 'workflow_dispatch' }} cancel-in-progress: true +permissions: read-all + jobs: # There must be at least one job here to satisfy GitHub action workflow syntax introduction: @@ -30,3 +32,174 @@ jobs: echo echo "Once the jobs are deemed stable enough (% red signal < 5% and TTS < 3h)," echo " they can graduate and move back to pull or trunk." 
+ + # + # Experimental ARC jobs + # + llm-td: + name: before-test + uses: ./.github/workflows/llm_td_retrieval.yml + permissions: + id-token: write + contents: read + + target-determination: + name: before-test + uses: ./.github/workflows/target_determination.yml + needs: llm-td + permissions: + id-token: write + contents: read + + linux-jammy-py3_8-gcc11-build: + name: linux-jammy-py3.8-gcc11 + uses: ./.github/workflows/_linux-build-rg.yml + with: + build-environment: linux-jammy-py3.8-gcc11 + docker-image-name: pytorch-linux-jammy-py3.8-gcc11 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "default", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "default", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "docs_test", shard: 1, num_shards: 1, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "jit_legacy", shard: 1, num_shards: 1, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "backwards_compat", shard: 1, num_shards: 1, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "distributed", shard: 1, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "distributed", shard: 2, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" }, + ]} + + linux-jammy-py3_8-gcc11-test: + name: linux-jammy-py3.8-gcc11 + uses: ./.github/workflows/_linux-test-rg.yml + needs: + - linux-jammy-py3_8-gcc11-build + - target-determination + with: + build-environment: linux-jammy-py3.8-gcc11 + docker-image: ${{ needs.linux-jammy-py3_8-gcc11-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-jammy-py3_8-gcc11-build.outputs.test-matrix }} + + linux-jammy-py3_8-gcc11-no-ops: + name: linux-jammy-py3.8-gcc11-no-ops + uses: ./.github/workflows/_linux-build-rg.yml + with: + build-environment: linux-jammy-py3.8-gcc11-no-ops + docker-image-name: pytorch-linux-jammy-py3.8-gcc11 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 1 }, + ]} + + linux-jammy-py3_8-gcc11-pch: + name: linux-jammy-py3.8-gcc11-pch + uses: ./.github/workflows/_linux-build-rg.yml + with: + build-environment: linux-jammy-py3.8-gcc11-pch + docker-image-name: pytorch-linux-jammy-py3.8-gcc11 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 1 }, + ]} + + linux-focal-py3_8-clang10-onnx-build: + name: linux-focal-py3.8-clang10-onnx + uses: ./.github/workflows/_linux-build-rg.yml + with: + build-environment: linux-focal-py3.8-clang10-onnx + docker-image-name: pytorch-linux-focal-py3-clang10-onnx + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "default", shard: 2, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" }, + ]} + + linux-focal-py3_8-clang10-onnx-test: + name: linux-focal-py3.8-clang10-onnx + uses: ./.github/workflows/_linux-test-rg.yml + needs: + - linux-focal-py3_8-clang10-onnx-build + - target-determination + with: + build-environment: linux-focal-py3.8-clang10-onnx + docker-image: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-py3_8-clang10-onnx-build.outputs.test-matrix }} + + linux-jammy-py3_10-clang15-asan-build: + name: linux-jammy-py3.10-clang15-asan + uses: ./.github/workflows/_linux-build-rg.yml + with: + build-environment: linux-jammy-py3.10-clang15-asan + docker-image-name: pytorch-linux-jammy-py3-clang15-asan + test-matrix: | + { include: [ + { config: "default", 
shard: 1, num_shards: 6, runner: "linux.4xlarge" }, + { config: "default", shard: 2, num_shards: 6, runner: "linux.4xlarge" }, + { config: "default", shard: 3, num_shards: 6, runner: "linux.4xlarge" }, + { config: "default", shard: 4, num_shards: 6, runner: "linux.4xlarge" }, + { config: "default", shard: 5, num_shards: 6, runner: "linux.4xlarge" }, + { config: "default", shard: 6, num_shards: 6, runner: "linux.4xlarge" }, + ]} + sync-tag: asan-build-arc + + linux-focal-py3_8-clang10-build: + name: linux-focal-py3.8-clang10 + uses: ./.github/workflows/_linux-build-rg.yml + with: + build-environment: linux-focal-py3.8-clang10 + docker-image-name: pytorch-linux-focal-py3.8-clang10 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "default", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "default", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "crossref", shard: 1, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "crossref", shard: 2, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "dynamo", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "dynamo", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "dynamo", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + ]} + + linux-focal-py3_8-clang10-test: + name: linux-focal-py3.8-clang10 + uses: ./.github/workflows/_linux-test-rg.yml + needs: + - linux-focal-py3_8-clang10-build + - target-determination + with: + build-environment: linux-focal-py3.8-clang10 + docker-image: ${{ needs.linux-focal-py3_8-clang10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-py3_8-clang10-build.outputs.test-matrix }} + + linux-focal-py3_11-clang10-build: + name: linux-focal-py3.11-clang10 + uses: ./.github/workflows/_linux-build-rg.yml + with: + build-environment: linux-focal-py3.11-clang10 + docker-image-name: pytorch-linux-focal-py3.11-clang10 + test-matrix: | + { include: [ + { config: "default", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "default", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "default", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "crossref", shard: 1, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "crossref", shard: 2, num_shards: 2, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "dynamo", shard: 1, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "dynamo", shard: 2, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + { config: "dynamo", shard: 3, num_shards: 3, runner: "arc-lf-linux.2xlarge.avx512" }, + ]} + + linux-focal-py3_11-clang10-test: + name: linux-focal-py3.11-clang10 + uses: ./.github/workflows/_linux-test-rg.yml + needs: + - linux-focal-py3_11-clang10-build + - target-determination + with: + build-environment: linux-focal-py3.11-clang10 + docker-image: ${{ needs.linux-focal-py3_11-clang10-build.outputs.docker-image }} + test-matrix: ${{ needs.linux-focal-py3_11-clang10-build.outputs.test-matrix }} + + # + # End of Experimental ARC jobs + # diff --git a/.github/workflows/update-viablestrict.yml b/.github/workflows/update-viablestrict.yml index 7b3d42f70ee8a..94a712b377484 100644 --- a/.github/workflows/update-viablestrict.yml +++ b/.github/workflows/update-viablestrict.yml @@ -13,46 +13,13 @@ jobs: 
do_update_viablestrict: if: ${{ github.repository_owner == 'pytorch' }} runs-on: ubuntu-20.04 - environment: mergebot + environment: ${{ (github.event_name == 'schedule') && 'mergebot' || '' }} steps: - - name: Checkout repo - uses: actions/checkout@v3 + - name: Update viable/strict + uses: pytorch/test-infra/.github/actions/update-viablestrict@main with: - fetch-depth: 0 - token: ${{ secrets.MERGEBOT_TOKEN }} - - - name: Setup Python - uses: actions/setup-python@v4 - with: - python-version: '3.8' - architecture: x64 - check-latest: false - cache: pip - cache-dependency-path: | - **/.ci/docker/requirements-ci.txt - **/.github/requirements-gha-cache.txt - - - name: Install Python Packages - run: | - pip3 install rockset==1.0.3 - pip3 install boto3==1.19.12 - - - name: Get latest viable commit - env: - ROCKSET_API_KEY: ${{ secrets.ROCKSET_API_KEY }} - run: | - output=$(python3 .github/scripts/fetch_latest_green_commit.py) - echo "latest_viable_sha=$output" >> "${GITHUB_OUTPUT}" - id: get-latest-commit - - - name: Push SHA to viable/strict branch - if: steps.get-latest-commit.outputs.latest_viable_sha != 'None' - env: - GITHUB_TOKEN: ${{ secrets.MERGEBOT_TOKEN }} - run: | - git config --global user.email "pytorchmergebot@users.noreply.github.com" - git config --global user.name "PyTorch MergeBot" - echo "Set the latest sha variable to be ${{ steps.get-latest-commit.outputs.latest_viable_sha }}" - # Pushing an older green commit here will fail because it's non-fast-forward, which is ok - # to ignore because we already have the later green commit in visable/strict - git push origin "${{ steps.get-latest-commit.outputs.latest_viable_sha }}":viable/strict || true + repository: pytorch/pytorch + stable-branch: viable/strict + requires: '[\"pull\", \"trunk\", \"lint\", \"linux-binary\"]' + secret-bot-token: ${{ secrets.MERGEBOT_TOKEN }} + rockset-api-key: ${{ secrets.ROCKSET_API_KEY }} diff --git a/.github/workflows/update_pytorch_labels.yml b/.github/workflows/update_pytorch_labels.yml index 81dc264cbcd67..db09474fb2120 100644 --- a/.github/workflows/update_pytorch_labels.yml +++ b/.github/workflows/update_pytorch_labels.yml @@ -12,16 +12,22 @@ jobs: update-labels-in-S3: runs-on: ubuntu-22.04 if: ${{ github.repository == 'pytorch/pytorch' }} + permissions: + id-token: write + contents: read steps: - name: Checkout PyTorch uses: pytorch/pytorch/.github/actions/checkout-pytorch@main with: fetch-depth: 1 submodules: false + - name: configure aws credentials + id: aws_creds + uses: aws-actions/configure-aws-credentials@v4 + with: + role-to-assume: arn:aws:iam::308535385114:role/gha_workflow_update_pytorch_labels + aws-region: us-east-1 - name: Update PyTorch labels list in S3 - env: - AWS_ACCESS_KEY_ID: ${{ secrets.AWS_OSSCI_METRICS_V2_ACCESS_KEY_ID }} - AWS_SECRET_ACCESS_KEY: ${{ secrets.AWS_OSSCI_METRICS_V2_SECRET_ACCESS_KEY }} run: | python3 -m pip install boto3==1.19.12 .github/scripts/export_pytorch_labels.py pytorch pytorch diff --git a/.github/workflows/weekly.yml b/.github/workflows/weekly.yml index 836d25cf5f280..f097b146c21f8 100644 --- a/.github/workflows/weekly.yml +++ b/.github/workflows/weekly.yml @@ -8,6 +8,8 @@ on: - cron: 37 7 * * 1 workflow_dispatch: +permissions: read-all + jobs: update-commit-hash: runs-on: ubuntu-latest @@ -19,18 +21,21 @@ jobs: fetch-depth: 0 - name: update-xla-commit-hash continue-on-error: true - uses: ./.github/actions/update-commit-hash + uses: pytorch/test-infra/.github/actions/update-commit-hash@main with: repo-name: xla branch: master + pin-folder: 
.github/ci_commit_pins + test-infra-ref: main updatebot-token: ${{ secrets.UPDATEBOT_TOKEN }} pytorchbot-token: ${{ secrets.GH_PYTORCHBOT_TOKEN }} - name: update-triton-commit-hash - uses: ./.github/actions/update-commit-hash + uses: pytorch/test-infra/.github/actions/update-commit-hash@main with: repo-owner: openai repo-name: triton branch: main pin-folder: .ci/docker/ci_commit_pins + test-infra-ref: main updatebot-token: ${{ secrets.UPDATEBOT_TOKEN }} pytorchbot-token: ${{ secrets.GH_PYTORCHBOT_TOKEN }} diff --git a/.github/workflows/xpu.yml b/.github/workflows/xpu.yml index 6cd8909ba8007..b48a7c01cc3be 100644 --- a/.github/workflows/xpu.yml +++ b/.github/workflows/xpu.yml @@ -30,6 +30,9 @@ jobs: name: linux-jammy-xpu-py3.8 uses: ./.github/workflows/_xpu-test.yml needs: linux-jammy-xpu-py3_8-build + permissions: + id-token: write + contents: read with: build-environment: linux-jammy-xpu-py3.8 docker-image: ${{ needs.linux-jammy-xpu-py3_8-build.outputs.docker-image }} diff --git a/.gitignore b/.gitignore index 20019ecd170f8..bfb3013c6d191 100644 --- a/.gitignore +++ b/.gitignore @@ -54,6 +54,7 @@ test/.coverage test/.hypothesis/ test/cpp/api/mnist test/custom_operator/model.pt +test/debug/ test/jit_hooks/*.pt test/data/legacy_modules.t7 test/data/*.pt @@ -86,6 +87,7 @@ torch/csrc/api/include/torch/version.h torch/csrc/cudnn/cuDNN.cpp torch/csrc/generated torch/csrc/generic/TensorMethods.cpp +torch/csrc/inductor/aoti_torch/generated/*.cpp torch/csrc/jit/generated/* torch/csrc/jit/fuser/config.h torch/csrc/nn/THCUNN.cpp @@ -126,6 +128,7 @@ env .circleci/scripts/COMMIT_MSG scripts/release_notes/*.json sccache-stats*.json +lint.json # These files get copied over on invoking setup.py torchgen/packaged/* diff --git a/.gitmodules b/.gitmodules index 7e1b09e591cd5..c9b84a3701674 100644 --- a/.gitmodules +++ b/.gitmodules @@ -149,3 +149,6 @@ [submodule "third_party/mimalloc"] path = third_party/mimalloc url = https://github.com/microsoft/mimalloc.git +[submodule "third_party/opentelemetry-cpp"] + path = third_party/opentelemetry-cpp + url = https://github.com/open-telemetry/opentelemetry-cpp.git diff --git a/.isort.cfg b/.isort.cfg deleted file mode 100644 index d14d9bf207e6f..0000000000000 --- a/.isort.cfg +++ /dev/null @@ -1,6 +0,0 @@ -[settings] -include_trailing_comma=True -multi_line_output=3 -skip=third_party -skip_gitignore=True -use_parentheses=True diff --git a/.lintrunner.toml b/.lintrunner.toml index c1f4cb6b5b9b4..7f76a35415ca7 100644 --- a/.lintrunner.toml +++ b/.lintrunner.toml @@ -1,5 +1,3 @@ -merge_base_with = "origin/main" - [[linter]] code = 'FLAKE8' include_patterns = ['**/*.py'] @@ -48,7 +46,7 @@ init_command = [ 'mccabe==0.7.0', 'pycodestyle==2.11.1', 'pyflakes==3.1.0', - 'torchfix==0.2.0', + 'torchfix==0.4.0 ; python_version >= "3.9"', ] @@ -57,6 +55,8 @@ code = 'CLANGFORMAT' include_patterns = [ 'aten/src/ATen/*.h', 'aten/src/ATen/mps/**/*.mm', + 'aten/src/ATen/xpu/**/*.h', + 'aten/src/ATen/xpu/**/*.cpp', 'aten/src/ATen/native/mps/**/*.mm', 'aten/src/ATen/native/vulkan/**/*.h', 'aten/src/ATen/native/vulkan/**/*.cpp', @@ -64,6 +64,8 @@ include_patterns = [ 'aten/src/ATen/native/**/Foreach*.*', 'aten/src/ATen/native/cuda/fused*.*', 'aten/src/ATen/native/cuda/Fused*.cu', + 'aten/src/ATen/native/cudnn/*.h', + 'aten/src/ATen/native/cudnn/*.cpp', 'c10/**/*.h', 'c10/**/*.cpp', 'torch/csrc/**/*.h', @@ -76,6 +78,7 @@ exclude_patterns = [ 'aten/src/ATen/native/vulkan/api/vk_mem_alloc.h', 'c10/util/strong_type.h', '**/fb/**', + 'torch/csrc/inductor/aoti_torch/generated/**', 
'torch/csrc/jit/serialization/mobile_bytecode_generated.h', 'torch/csrc/utils/pythoncapi_compat.h', 'aten/src/ATen/dlpack.h', @@ -118,39 +121,6 @@ include_patterns = [ ] exclude_patterns = [ '**/fb/**', - 'torch/include/**', - 'torch/csrc/**', - 'torch/_dynamo/**/*.py', - 'torch/_inductor/**/*.py', - 'torch/_numpy/**/*.py', - 'torch/_functorch/aot_autograd.py', - 'torch/_functorch/benchmark_utils.py', - 'torch/_functorch/compile_utils.py', - 'torch/_functorch/compilers.py', - 'torch/_functorch/eager_transforms.py', - 'torch/_functorch/fx_minifier.py', - 'torch/_functorch/partitioners.py', - 'torch/_functorch/top_operators_github_usage.py', - 'torch/_functorch/vmap.py', - 'torch/_subclasses/schema_check_mode.py', - 'torch/distributed/elastic/agent/server/api.py', - 'torch/testing/_internal/**', - 'torch/distributed/fsdp/fully_sharded_data_parallel.py', - # TODO(suo): these exclusions were added just to get lint clean on master. - # Follow up to do more target suppressions and remove them. - 'torch/ao/quantization/fx/convert.py', - 'torch/ao/quantization/_dbr/function_fusion.py', - 'test/test_datapipe.py', - 'caffe2/contrib/fakelowp/test/test_batchmatmul_nnpi_fp16.py', - 'test/test_numpy_interop.py', - 'torch/torch_version.py', - 'torch/fx/proxy.py', - 'torch/fx/passes/shape_prop.py', - 'torch/fx/node.py', - 'torch/fx/experimental/symbolic_shapes.py', - 'torch/fx/experimental/proxy_tensor.py', - 'torch/_subclasses/fake_utils.py', - 'torch/_subclasses/fake_tensor.py', ] command = [ 'python3', @@ -166,45 +136,20 @@ init_command = [ 'numpy==1.24.3 ; python_version == "3.8"', 'numpy==1.26.0 ; python_version >= "3.9"', 'expecttest==0.1.6', - 'mypy==1.7.0', + 'mypy==1.9.0', + 'sympy==1.11.1', 'types-requests==2.27.25', 'types-PyYAML==6.0.7', 'types-tabulate==0.8.8', 'types-protobuf==3.19.18', 'types-pkg-resources==0.1.3', 'types-Jinja2==2.11.9', + 'types-colorama==0.4.6', + 'filelock==3.13.1', 'junitparser==2.1.1', 'rich==10.9.0', - 'pyyaml==6.0', - 'optree==0.10.0', -] - -[[linter]] -code = 'MYPYINDUCTOR' -include_patterns = [ - 'torch/_dynamo/**/*.py', - 'torch/_inductor/**/*.py', -] -exclude_patterns = [ - '**/fb/**', - 'torch/_dynamo/backends/**/*.py', - 'torch/_dynamo/variables/**/*.py', - 'torch/_dynamo/polyfill.py', - 'torch/_inductor/fx_passes/serialized_patterns/**', -] -command = [ - 'python3', - 'tools/linter/adapters/mypy_linter.py', - '--config=mypy-inductor.ini', - '--code=MYPYINDUCTOR', - '--', - '@{{PATHSFILE}}' -] -init_command = [ - 'python3', - 'tools/linter/adapters/pip_init.py', - '--dry-run={{DRYRUN}}', - 'types-colorama==0.4.6', + 'pyyaml==6.0.1', + 'optree==0.11.0', ] [[linter]] @@ -242,10 +187,19 @@ command = [ [[linter]] code = 'CLANGTIDY' include_patterns = [ + # Enable coverage of headers in aten/src/ATen + # and excluding most sub-directories for now. + 'aten/src/ATen/*.h', + 'aten/src/ATen/*.cpp', + 'aten/src/ATen/core/*.h', 'aten/src/ATen/core/*.cpp', + 'aten/src/ATen/functorch/*.h', + 'aten/src/ATen/functorch/*.cpp', 'c10/**/*.cpp', - 'c10/core/**/*.h', - 'c10/util/**/*.h', + 'c10/**/*.h', + 'torch/csrc/*.h', + 'torch/csrc/*.cpp', + 'torch/csrc/**/*.h', 'torch/csrc/**/*.cpp', ] exclude_patterns = [ @@ -254,8 +208,10 @@ exclude_patterns = [ # CUDA files are also excluded. 
'**/fb/**', '**/*pb.h', - '**/*CUDA*', - '**/cuda/*pp', + 'aten/**/cuda/*pp', + 'c10/xpu/**/*.h', + 'c10/xpu/**/*.cpp', + 'c10/cuda/CUDAAlgorithm.h', 'c10/util/complex_math.h', 'c10/util/complex_utils.h', 'c10/util/flat_hash_map.h', @@ -266,14 +222,13 @@ exclude_patterns = [ 'c10/util/SmallVector.h', 'c10/util/win32-headers.h', 'c10/util/*inl.h', + 'c10/test/**/*.h', 'aten/src/ATen/core/TensorImpl_test.cpp', 'third_party/**/*', 'torch/csrc/api/**', 'torch/csrc/autograd/generated/**', - 'torch/csrc/autograd/profiler_legacy.cpp', - 'torch/csrc/cuda/**', - 'torch/csrc/dynamo/*', 'torch/csrc/distributed/**/*', + 'torch/csrc/dynamo/eval_frame.h', 'torch/csrc/inductor/**/*', 'torch/csrc/jit/**/*', 'torch/csrc/jit/serialization/import_legacy.cpp', @@ -322,6 +277,26 @@ command = [ '@{{PATHSFILE}}' ] +[[linter]] +code = 'TYPENOSKIP' +include_patterns = ['mypy.ini'] +command = [ + 'python3', + 'tools/linter/adapters/grep_linter.py', + '--pattern=follow_imports\s*=\s*skip', + '--linter-name=TYPENOSKIP', + '--error-name=use of follow_imports = skip', + """--error-description=\ + follow_imports = skip is forbidden from mypy.ini configuration as it \ + is extremely easy to accidentally turn off type checking unintentionally. If \ + you need to suppress type errors, use a top level # mypy: ignore-errors. \ + Do not rely on automatic Any substitution; instead, manually # type: ignore \ + at use sites or define a pyi type stub with more relaxed types. \ + """, + '--', + '@{{PATHSFILE}}' +] + [[linter]] code = 'NOQA' include_patterns = ['**/*.py', '**/*.pyi'] @@ -1006,7 +981,7 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', - 'PyYAML==6.0', + 'PyYAML==6.0.1', ] # Black + usort @@ -1040,424 +1015,9 @@ exclude_patterns = [ 'test/_nvfuser/test_dynamo.py', 'test/_nvfuser/test_python_frontend.py', 'test/_nvfuser/test_torchscript.py', - 'test/_test_bazel.py', - 'test/ao/sparsity/test_activation_sparsifier.py', - 'test/ao/sparsity/test_composability.py', - 'test/ao/sparsity/test_data_scheduler.py', - 'test/ao/sparsity/test_data_sparsifier.py', - 'test/ao/sparsity/test_kernels.py', - 'test/ao/sparsity/test_parametrization.py', - 'test/ao/sparsity/test_qlinear_packed_params.py', - 'test/ao/sparsity/test_scheduler.py', - 'test/ao/sparsity/test_sparsifier.py', - 'test/ao/sparsity/test_sparsity_utils.py', - 'test/ao/sparsity/test_structured_sparsifier.py', - 'test/autograd/test_complex.py', - 'test/autograd/test_fallback.py', - 'test/autograd/test_functional.py', - 'test/backends/xeon/test_launch.py', - 'test/benchmark_utils/test_benchmark_utils.py', - 'test/bottleneck_test/test.py', - 'test/bottleneck_test/test_args.py', - 'test/bottleneck_test/test_cuda.py', - 'test/conftest.py', - 'test/cpp/__init__.py', - 'test/cpp/aot_inductor/test.py', - 'test/cpp/api/init_baseline.py', - 'test/cpp/api/optim_baseline.py', - 'test/cpp/jit/__init__.py', - 'test/cpp/jit/tests_setup.py', - 'test/cpp_api_parity/__init__.py', - 'test/cpp_api_parity/functional_impl_check.py', - 'test/cpp_api_parity/module_impl_check.py', - 'test/cpp_api_parity/parity_table_parser.py', - 'test/cpp_api_parity/sample_functional.py', - 'test/cpp_api_parity/sample_module.py', - 'test/cpp_api_parity/utils.py', - 'test/cpp_extensions/no_python_abi_suffix_test/setup.py', - 'test/cpp_extensions/setup.py', - 'test/cpp_extensions/torch_test_cpp_extension/__init__.py', - 'test/create_dummy_torchscript_model.py', - 'test/custom_backend/backend.py', - 'test/custom_backend/test_custom_backend.py', - 
'test/custom_operator/model.py', - 'test/custom_operator/test_custom_ops.py', 'test/delete.py', - 'test/distributed/_shard/sharded_optim/test_sharded_optim.py', - 'test/distributed/_shard/sharded_tensor/ops/test_binary_cmp.py', - 'test/distributed/_shard/sharded_tensor/ops/test_embedding.py', - 'test/distributed/_shard/sharded_tensor/ops/test_embedding_bag.py', - 'test/distributed/_shard/sharded_tensor/ops/test_init.py', - 'test/distributed/_shard/sharded_tensor/ops/test_tensor_ops.py', - 'test/distributed/_shard/sharded_tensor/test_logger.py', - 'test/distributed/_shard/sharded_tensor/test_sharded_tensor.py', - 'test/distributed/_shard/sharded_tensor/test_sharded_tensor_reshard.py', - 'test/distributed/_shard/sharding_plan/test_sharding_plan.py', - 'test/distributed/_shard/sharding_spec/test_sharding_spec.py', - 'test/distributed/_shard/test_sharder.py', - 'test/distributed/_tools/test_memory_tracker.py', - 'test/distributed/algorithms/ddp_comm_hooks/test_ddp_hooks.py', - 'test/distributed/algorithms/quantization/test_quantization.py', - 'test/distributed/algorithms/test_join.py', - 'test/distributed/argparse_util_test.py', - 'test/distributed/bin/test_script.py', - 'test/distributed/elastic/agent/server/test/__init__.py', - 'test/distributed/elastic/agent/server/test/api_test.py', - 'test/distributed/elastic/agent/server/test/local_elastic_agent_test.py', - 'test/distributed/elastic/events/lib_test.py', - 'test/distributed/elastic/metrics/__init__.py', - 'test/distributed/elastic/metrics/api_test.py', - 'test/distributed/elastic/multiprocessing/api_test.py', - 'test/distributed/elastic/multiprocessing/bin/echo1.py', - 'test/distributed/elastic/multiprocessing/bin/echo2.py', - 'test/distributed/elastic/multiprocessing/bin/echo3.py', - 'test/distributed/elastic/multiprocessing/bin/test_script.py', - 'test/distributed/elastic/multiprocessing/bin/zombie_test.py', - 'test/distributed/elastic/multiprocessing/errors/api_test.py', - 'test/distributed/elastic/multiprocessing/errors/error_handler_test.py', - 'test/distributed/elastic/multiprocessing/redirects_test.py', - 'test/distributed/elastic/multiprocessing/tail_log_test.py', - 'test/distributed/elastic/rendezvous/__init__.py', - 'test/distributed/elastic/rendezvous/api_test.py', - 'test/distributed/elastic/rendezvous/c10d_rendezvous_backend_test.py', - 'test/distributed/elastic/rendezvous/dynamic_rendezvous_test.py', - 'test/distributed/elastic/rendezvous/etcd_rendezvous_backend_test.py', - 'test/distributed/elastic/rendezvous/etcd_rendezvous_test.py', - 'test/distributed/elastic/rendezvous/etcd_server_test.py', - 'test/distributed/elastic/rendezvous/rendezvous_backend_test.py', - 'test/distributed/elastic/rendezvous/static_rendezvous_test.py', - 'test/distributed/elastic/rendezvous/utils_test.py', - 'test/distributed/elastic/timer/__init__.py', - 'test/distributed/elastic/timer/api_test.py', - 'test/distributed/elastic/timer/file_based_local_timer_test.py', - 'test/distributed/elastic/timer/local_timer_example.py', - 'test/distributed/elastic/timer/local_timer_test.py', - 'test/distributed/elastic/utils/__init__.py', - 'test/distributed/elastic/utils/data/__init__.py', - 'test/distributed/elastic/utils/data/cycling_iterator_test.py', - 'test/distributed/elastic/utils/distributed_test.py', - 'test/distributed/elastic/utils/logging_test.py', - 'test/distributed/elastic/utils/util_test.py', - 'test/distributed/launcher/__init__.py', - 'test/distributed/launcher/api_test.py', - 'test/distributed/launcher/bin/test_script.py', - 
'test/distributed/launcher/bin/test_script_init_method.py', - 'test/distributed/launcher/bin/test_script_is_torchelastic_launched.py', - 'test/distributed/launcher/bin/test_script_local_rank.py', - 'test/distributed/launcher/launch_test.py', - 'test/distributed/launcher/run_test.py', - 'test/distributed/nn/jit/__init__.py', - 'test/distributed/nn/jit/test_instantiator.py', - 'test/distributed/optim/test_apply_optimizer_in_backward.py', - 'test/distributed/optim/test_named_optimizer.py', - 'test/distributed/optim/test_zero_redundancy_optimizer.py', - 'test/distributed/pipeline/sync/__init__.py', - 'test/distributed/pipeline/sync/conftest.py', - 'test/distributed/pipeline/sync/skip/__init__.py', - 'test/distributed/pipeline/sync/skip/test_api.py', - 'test/distributed/pipeline/sync/skip/test_gpipe.py', - 'test/distributed/pipeline/sync/skip/test_inspect_skip_layout.py', - 'test/distributed/pipeline/sync/skip/test_leak.py', - 'test/distributed/pipeline/sync/skip/test_portal.py', - 'test/distributed/pipeline/sync/skip/test_stash_pop.py', - 'test/distributed/pipeline/sync/skip/test_tracker.py', - 'test/distributed/pipeline/sync/skip/test_verify_skippables.py', - 'test/distributed/pipeline/sync/test_balance.py', - 'test/distributed/pipeline/sync/test_bugs.py', - 'test/distributed/pipeline/sync/test_checkpoint.py', - 'test/distributed/pipeline/sync/test_copy.py', - 'test/distributed/pipeline/sync/test_deferred_batch_norm.py', - 'test/distributed/pipeline/sync/test_dependency.py', - 'test/distributed/pipeline/sync/test_inplace.py', - 'test/distributed/pipeline/sync/test_microbatch.py', - 'test/distributed/pipeline/sync/test_phony.py', - 'test/distributed/pipeline/sync/test_pipe.py', - 'test/distributed/pipeline/sync/test_pipeline.py', - 'test/distributed/pipeline/sync/test_stream.py', - 'test/distributed/pipeline/sync/test_transparency.py', - 'test/distributed/pipeline/sync/test_worker.py', - 'test/distributed/rpc/cuda/test_tensorpipe_agent.py', - 'test/distributed/rpc/test_faulty_agent.py', - 'test/distributed/rpc/test_share_memory.py', - 'test/distributed/rpc/test_tensorpipe_agent.py', - 'test/distributed/tensor/parallel/__init__.py', - 'test/distributed/tensor/parallel/test_ddp_2d_parallel.py', - 'test/distributed/tensor/parallel/test_fsdp_2d_parallel.py', - 'test/distributed/tensor/parallel/test_parallelize_api.py', - 'test/distributed/tensor/parallel/test_tp_examples.py', - 'test/distributed/tensor/parallel/test_tp_random_state.py', - 'test/distributed/tensor/parallel/test_tp_style.py', - 'test/distributed/tensor/parallel/test_view_sharding_dim_change.py', - 'test/distributed/test_c10d_common.py', - 'test/distributed/test_c10d_gloo.py', - 'test/distributed/test_c10d_logger.py', - 'test/distributed/test_c10d_nccl.py', - 'test/distributed/test_c10d_object_collectives.py', - 'test/distributed/test_c10d_pypg.py', - 'test/distributed/test_c10d_spawn.py', - 'test/distributed/test_c10d_spawn_gloo.py', - 'test/distributed/test_c10d_spawn_nccl.py', - 'test/distributed/test_c10d_spawn_ucc.py', - 'test/distributed/test_c10d_ucc.py', - 'test/distributed/test_collective_utils.py', - 'test/distributed/test_data_parallel.py', - 'test/distributed/test_distributed_spawn.py', - 'test/distributed/test_dynamo_distributed.py', - 'test/distributed/test_fake_pg.py', - 'test/distributed/test_functional_api.py', - 'test/distributed/test_inductor_collectives.py', - 'test/distributed/test_launcher.py', - 'test/distributed/test_multi_threaded_pg.py', - 'test/distributed/test_nccl.py', - 
'test/distributed/test_pg_wrapper.py', - 'test/distributed/test_store.py', - 'test/distributions/test_constraints.py', - 'test/distributions/test_distributions.py', - 'test/distributions/test_transforms.py', - 'test/distributions/test_utils.py', - 'test/error_messages/storage.py', 'test/expect/__init__.py', - 'test/export/test_db.py', - 'test/export/test_export.py', - 'test/export/test_funtionalized_assertions.py', - 'test/export/test_pass_infra.py', - 'test/export/test_passes.py', - 'test/export/test_serialize.py', - 'test/export/test_upgrade.py', - 'test/export/test_verifier.py', - 'test/export/test_unflatten.py', - 'test/forward_backward_compatibility/check_forward_backward_compatibility.py', - 'test/forward_backward_compatibility/dump_all_function_schemas.py', - 'test/functorch/attn_ft.py', - 'test/functorch/attn_positional.py', - 'test/functorch/common_utils.py', - 'test/functorch/discover_coverage.py', - 'test/functorch/functorch_additional_op_db.py', - 'test/functorch/test_aotdispatch.py', - 'test/functorch/test_control_flow.py', - 'test/functorch/test_dims.py', - 'test/functorch/test_eager_transforms.py', - 'test/functorch/test_logging.py', - 'test/functorch/test_memory_efficient_fusion.py', - 'test/functorch/test_minifier.py', - 'test/functorch/test_ops.py', - 'test/functorch/test_parsing.py', - 'test/functorch/test_rearrange.py', - 'test/functorch/test_vmap.py', - 'test/functorch/test_vmap_registrations.py', - 'test/functorch/xfail_suggester.py', - 'test/fx/named_tup.py', - 'test/fx/quantization.py', - 'test/fx/test_common_passes.py', - 'test/fx/test_cse_pass.py', - 'test/fx/test_dce_pass.py', - 'test/fx/test_future.py', - 'test/fx/test_fx_const_fold.py', - 'test/fx/test_fx_param_shape_control_flow.py', - 'test/fx/test_gradual_type.py', - 'test/fx/test_matcher_utils.py', - 'test/fx/test_pass_infra.py', - 'test/fx/test_source_matcher_utils.py', - 'test/fx/test_subgraph_rewriter.py', - 'test/fx/test_z3_gradual_types.py', - 'test/fx/test_fx_split.py', - 'test/jit/__init__.py', - 'test/jit/_imported_class_test/__init__.py', - 'test/jit/_imported_class_test/bar.py', - 'test/jit/_imported_class_test/foo.py', - 'test/jit/_imported_class_test/very/__init__.py', - 'test/jit/_imported_class_test/very/very/__init__.py', - 'test/jit/_imported_class_test/very/very/nested.py', - 'test/jit/fixtures_srcs/__init__.py', - 'test/jit/fixtures_srcs/fixtures_src.py', - 'test/jit/fixtures_srcs/generate_models.py', - 'test/jit/fixtures_srcs/test_upgrader_models_generation.py', - 'test/jit/myexception.py', - 'test/jit/test_alias_analysis.py', - 'test/jit/test_async.py', - 'test/jit/test_aten_pow.py', - 'test/jit/test_attr.py', - 'test/jit/test_autodiff.py', - 'test/jit/test_autodiff_subgraph_slicing.py', - 'test/jit/test_await.py', - 'test/jit/test_backend_nnapi.py', - 'test/jit/test_backends.py', - 'test/jit/test_batch_mm.py', - 'test/jit/test_builtins.py', - 'test/jit/test_class_type.py', - 'test/jit/test_complex.py', - 'test/jit/test_complexity.py', - 'test/jit/test_convert_activation.py', - 'test/jit/test_cuda.py', - 'test/jit/test_custom_operators.py', - 'test/jit/test_data_parallel.py', - 'test/jit/test_dataclasses.py', - 'test/jit/test_dce.py', - 'test/jit/test_device_analysis.py', - 'test/jit/test_dtype_analysis.py', - 'test/jit/test_enum.py', - 'test/jit/test_exception.py', - 'test/jit/test_freezing.py', - 'test/jit/test_functional_blocks.py', - 'test/jit/test_fuser_common.py', - 'test/jit/test_graph_rewrite_passes.py', - 'test/jit/test_hash.py', - 'test/jit/test_hooks.py', - 
'test/jit/test_hooks_modules.py', - 'test/jit/test_ignorable_args.py', - 'test/jit/test_ignore_context_manager.py', - 'test/jit/test_isinstance.py', - 'test/jit/test_jit_utils.py', - 'test/jit/test_list_dict.py', - 'test/jit/test_logging.py', - 'test/jit/test_misc.py', - 'test/jit/test_models.py', - 'test/jit/test_module_apis.py', - 'test/jit/test_module_containers.py', - 'test/jit/test_module_interface.py', - 'test/jit/test_modules.py', - 'test/jit/test_op_decompositions.py', - 'test/jit/test_optimize_for_mobile_preserve_debug_info.py', - 'test/jit/test_parametrization.py', - 'test/jit/test_pdt.py', - 'test/jit/test_peephole.py', - 'test/jit/test_profiler.py', - 'test/jit/test_python_bindings.py', - 'test/jit/test_python_builtins.py', - 'test/jit/test_python_ir.py', - 'test/jit/test_recursive_script.py', - 'test/jit/test_remove_mutation.py', - 'test/jit/test_save_load.py', - 'test/jit/test_save_load_for_op_version.py', - 'test/jit/test_script_profile.py', - 'test/jit/test_scriptmod_ann.py', - 'test/jit/test_slice.py', - 'test/jit/test_sparse.py', - 'test/jit/test_string_formatting.py', - 'test/jit/test_symbolic_shape_analysis.py', - 'test/jit/test_tensor_creation_ops.py', - 'test/jit/test_tensor_methods.py', - 'test/jit/test_torchbind.py', - 'test/jit/test_tracer.py', - 'test/jit/test_type_sharing.py', - 'test/jit/test_types.py', - 'test/jit/test_typing.py', - 'test/jit/test_union.py', - 'test/jit/test_unsupported_ops.py', - 'test/jit/test_upgraders.py', - 'test/jit/test_warn.py', - 'test/jit/test_with.py', - 'test/jit/xnnpack/test_xnnpack_delegate.py', - 'test/jit_hooks/model.py', - 'test/lazy/__init__.py', - 'test/lazy/test_bindings.py', - 'test/lazy/test_debug_util.py', - 'test/lazy/test_extract_compiled_graph.py', - 'test/lazy/test_meta_kernel.py', - 'test/lazy/test_reuse_ir.py', - 'test/lazy/test_step_closures.py', - 'test/lazy/test_ts_opinfo.py', - 'test/linear.py', - 'test/load_torchscript_model.py', - 'test/mkl_verbose.py', - 'test/mkldnn_verbose.py', - 'test/mobile/custom_build/prepare_model.py', - 'test/mobile/lightweight_dispatch/tests_setup.py', - 'test/mobile/model_test/android_api_module.py', - 'test/mobile/model_test/builtin_ops.py', - 'test/mobile/model_test/gen_test_model.py', - 'test/mobile/model_test/math_ops.py', - 'test/mobile/model_test/nn_ops.py', - 'test/mobile/model_test/quantization_ops.py', - 'test/mobile/model_test/sampling_ops.py', - 'test/mobile/model_test/tensor_ops.py', - 'test/mobile/model_test/torchvision_models.py', - 'test/mobile/model_test/update_production_ops.py', - 'test/mobile/nnc/aot_test_model.py', - 'test/mobile/test_bytecode.py', - 'test/mobile/test_lite_script_module.py', - 'test/mobile/test_lite_script_type.py', - 'test/mobile/test_quantize_fx_lite_script_module.py', - 'test/mobile/test_upgrader_codegen.py', - 'test/mobile/test_upgraders.py', - 'test/nn/test_convolution.py', - 'test/nn/test_dropout.py', - 'test/nn/test_embedding.py', - 'test/nn/test_init.py', - 'test/nn/test_lazy_modules.py', - 'test/nn/test_module_hooks.py', - 'test/nn/test_multihead_attention.py', - 'test/nn/test_packed_sequence.py', - 'test/nn/test_parametrization.py', - 'test/nn/test_pooling.py', - 'test/nn/test_pruning.py', - 'test/onnx_caffe2/export_onnx_tests_filter.py', - 'test/onnx_caffe2/export_onnx_tests_generator.py', - 'test/onnx_caffe2/test_caffe2_common.py', - 'test/onnx_caffe2/test_custom_ops.py', - 'test/onnx_caffe2/test_pytorch_helper.py', - 'test/onnx_caffe2/test_pytorch_onnx_caffe2.py', - 'test/onnx_caffe2/test_pytorch_onnx_caffe2_quantized.py', - 
'test/onnx_caffe2/test_verify.py', - 'test/optim/test_lrscheduler.py', - 'test/optim/test_optim.py', - 'test/optim/test_swa_utils.py', - 'test/package/__init__.py', - 'test/package/common.py', - 'test/package/generate_bc_packages.py', - 'test/package/module_a.py', - 'test/package/module_a_remapped_path.py', - 'test/package/package_a/__init__.py', - 'test/package/package_a/fake_interface.py', - 'test/package/package_a/fake_script_class.py', - 'test/package/package_a/long_name.py', - 'test/package/package_a/std_sys_module_hacks.py', - 'test/package/package_a/subpackage.py', - 'test/package/package_a/test_all_leaf_modules_tracer.py', - 'test/package/package_a/test_module.py', - 'test/package/package_a/test_nn_module.py', - 'test/package/package_a/use_dunder_package.py', - 'test/package/package_a/use_torch_package_importer.py', - 'test/package/package_b/__init__.py', - 'test/package/package_b/subpackage_0/__init__.py', - 'test/package/package_b/subpackage_0/subsubpackage_0/__init__.py', - 'test/package/package_b/subpackage_1.py', - 'test/package/package_b/subpackage_2.py', - 'test/package/package_c/__init__.py', - 'test/package/package_c/test_module.py', - 'test/package/package_d/__init__.py', - 'test/package/package_d/imports_directly.py', - 'test/package/package_d/imports_indirectly.py', - 'test/package/package_d/subpackage_0/__init__.py', - 'test/package/package_d/subpackage_0/subsubpackage_0/__init__.py', - 'test/package/test_analyze.py', - 'test/package/test_dependency_api.py', - 'test/package/test_dependency_hooks.py', - 'test/package/test_digraph.py', - 'test/package/test_directory_reader.py', - 'test/package/test_glob_group.py', - 'test/package/test_importer.py', - 'test/package/test_load_bc_packages.py', - 'test/package/test_mangling.py', - 'test/package/test_misc.py', - 'test/package/test_model.py', - 'test/package/test_package_fx.py', - 'test/package/test_package_script.py', - 'test/package/test_repackage.py', - 'test/package/test_resources.py', - 'test/package/test_save_load.py', - 'test/package/test_trace_dep/__init__.py', - 'test/profiler/test_memory_profiler.py', - 'test/profiler/test_profiler.py', - 'test/profiler/test_profiler_tree.py', 'test/quantization/__init__.py', - 'test/quantization/ao_migration/__init__.py', - 'test/quantization/ao_migration/common.py', - 'test/quantization/ao_migration/test_ao_migration.py', - 'test/quantization/ao_migration/test_quantization.py', - 'test/quantization/ao_migration/test_quantization_fx.py', - 'test/quantization/bc/__init__.py', - 'test/quantization/bc/test_backward_compatibility.py', 'test/quantization/core/__init__.py', 'test/quantization/core/experimental/apot_fx_graph_mode_ptq.py', 'test/quantization/core/experimental/apot_fx_graph_mode_qat.py', @@ -1492,54 +1052,12 @@ exclude_patterns = [ 'test/quantization/fx/test_numeric_suite_fx.py', 'test/quantization/fx/test_quantize_fx.py', 'test/quantization/fx/test_subgraph_rewriter.py', - 'test/quantization/jit/__init__.py', - 'test/quantization/jit/test_deprecated_jit_quant.py', - 'test/quantization/jit/test_fusion_passes.py', - 'test/quantization/jit/test_ondevice_quantization.py', - 'test/quantization/jit/test_quantize_jit.py', - 'test/quantization/pt2e/test_graph_utils.py', - 'test/quantization/pt2e/test_quantize_pt2e.py', - 'test/quantization/pt2e/test_x86inductor_quantizer.py', - 'test/scripts/cuda_memcheck_common.py', - 'test/scripts/run_cuda_memcheck.py', - 'test/simulate_nccl_errors.py', - 'test/test_ao_sparsity.py', - 'test/test_autocast.py', - 'test/test_autograd.py', - 
'test/test_binary_ufuncs.py', - 'test/test_bundled_images.py', - 'test/test_bundled_inputs.py', - 'test/test_comparison_utils.py', - 'test/test_compile_benchmark_util.py', - 'test/test_complex.py', - 'test/test_cpp_api_parity.py', - 'test/test_cpp_extensions_aot.py', - 'test/test_cpp_extensions_jit.py', - 'test/test_cpp_extensions_open_device_registration.py', - 'test/test_cuda.py', - 'test/test_cuda_expandable_segments.py', - 'test/test_cuda_multigpu.py', - 'test/test_cuda_nvml_based_avail.py', - 'test/test_cuda_primary_ctx.py', - 'test/test_cuda_sanitizer.py', - 'test/test_cuda_trace.py', - 'test/test_custom_op_testing.py', - 'test/test_dataloader.py', 'test/test_datapipe.py', - 'test/test_decomp.py', - 'test/test_deploy.py', - 'test/test_determination.py', - 'test/test_dispatch.py', - 'test/test_dlpack.py', - 'test/test_dynamic_shapes.py', - 'test/test_expanded_weights.py', 'test/test_fake_tensor.py', 'test/test_flop_counter.py', - 'test/test_foreach.py', 'test/test_function_schema.py', 'test/test_functional_autograd_benchmark.py', 'test/test_functional_optim.py', - 'test/test_functionalization.py', 'test/test_functionalization_of_rng_ops.py', 'test/test_futures.py', 'test/test_fx.py', @@ -1548,7 +1066,6 @@ exclude_patterns = [ 'test/test_fx_reinplace_pass.py', 'test/test_hub.py', 'test/test_import_stats.py', - 'test/test_indexing.py', 'test/test_itt.py', 'test/test_jit.py', 'test/test_jit_autocast.py', @@ -1564,7 +1081,6 @@ exclude_patterns = [ 'test/test_jit_string.py', 'test/test_jiterator.py', 'test/test_kernel_launch_checks.py', - 'test/test_legacy_vmap.py', 'test/test_license.py', 'test/test_linalg.py', 'test/test_logging.py', @@ -1579,11 +1095,9 @@ exclude_patterns = [ 'test/test_mkldnn_verbose.py', 'test/test_mobile_optimizer.py', 'test/test_model_dump.py', - 'test/test_module_init.py', 'test/test_modules.py', 'test/test_monitor.py', 'test/test_mps.py', - 'test/test_multiprocessing.py', 'test/test_multiprocessing_spawn.py', 'test/test_namedtensor.py', 'test/test_namedtuple_return_api.py', @@ -1597,10 +1111,6 @@ exclude_patterns = [ 'test/test_nvfuser_dynamo.py', 'test/test_nvfuser_frontend.py', 'test/test_openmp.py', - 'test/test_ops.py', - 'test/test_ops_fwd_gradients.py', - 'test/test_ops_gradients.py', - 'test/test_ops_jit.py', 'test/test_optim.py', 'test/test_out_dtype_op.py', 'test/test_overrides.py', @@ -1610,7 +1120,6 @@ exclude_patterns = [ 'test/test_proxy_tensor.py', 'test/test_pruning_op.py', 'test/test_public_bindings.py', - 'test/test_python_dispatch.py', 'test/test_quantization.py', 'test/test_reductions.py', 'test/test_scatter_gather_ops.py', @@ -1642,7 +1151,6 @@ exclude_patterns = [ 'test/test_type_promotion.py', 'test/test_unary_ufuncs.py', 'test/test_utils.py', - 'test/test_view_ops.py', 'test/test_vulkan.py', 'test/test_xnnpack_integration.py', 'test/torch_np/numpy_test/**/*.py', @@ -1719,24 +1227,6 @@ exclude_patterns = [ 'torch/_export/serde/upgrade.py', 'torch/_export/trace.py', 'torch/_export/verifier.py', - 'torch/_functorch/__init__.py', - 'torch/_functorch/aot_autograd.py', - 'torch/_functorch/apis.py', - 'torch/_functorch/autograd_function.py', - 'torch/_functorch/batch_norm_replacement.py', - 'torch/_functorch/benchmark_utils.py', - 'torch/_functorch/compile_utils.py', - 'torch/_functorch/compilers.py', - 'torch/_functorch/config.py', - 'torch/_functorch/deprecated.py', - 'torch/_functorch/eager_transforms.py', - 'torch/_functorch/fx_minifier.py', - 'torch/_functorch/partitioners.py', - 'torch/_functorch/pyfunctorch.py', - 
'torch/_functorch/python_key.py', - 'torch/_functorch/top_operators_github_usage.py', - 'torch/_functorch/utils.py', - 'torch/_functorch/vmap.py', 'torch/_higher_order_ops/__init__.py', 'torch/_higher_order_ops/out_dtype.py', 'torch/_higher_order_ops/wrap.py', @@ -1934,6 +1424,7 @@ exclude_patterns = [ 'torch/compiler/__init__.py', 'torch/contrib/__init__.py', 'torch/contrib/_tensorboard_vis.py', + "torch/cuda/_gpu_trace.py", 'torch/cuda/_memory_viz.py', # mypy: Value of type "object" is not indexable 'torch/distributed/__init__.py', 'torch/distributed/_composable_state.py', @@ -2346,25 +1837,6 @@ exclude_patterns = [ 'torch/nn/utils/rnn.py', 'torch/nn/utils/spectral_norm.py', 'torch/nn/utils/weight_norm.py', - 'torch/optim/__init__.py', - 'torch/optim/_functional.py', - 'torch/optim/_multi_tensor/__init__.py', - 'torch/optim/adadelta.py', - 'torch/optim/adagrad.py', - 'torch/optim/adam.py', - 'torch/optim/adamax.py', - 'torch/optim/adamw.py', - 'torch/optim/asgd.py', - 'torch/optim/lbfgs.py', - 'torch/optim/lr_scheduler.py', - 'torch/optim/nadam.py', - 'torch/optim/optimizer.py', - 'torch/optim/radam.py', - 'torch/optim/rmsprop.py', - 'torch/optim/rprop.py', - 'torch/optim/sgd.py', - 'torch/optim/sparse_adam.py', - 'torch/optim/swa_utils.py', 'torch/overrides.py', 'torch/quasirandom.py', 'torch/random.py', @@ -2399,7 +1871,7 @@ exclude_patterns = [ 'torch/testing/_internal/common_subclass.py', 'torch/testing/_internal/common_utils.py', 'torch/testing/_internal/composite_compliance.py', - 'torch/testing/_internal/control_flow_opinfo_db.py', + 'torch/testing/_internal/hop_db.py', 'torch/testing/_internal/custom_op_db.py', 'torch/testing/_internal/data/__init__.py', 'torch/testing/_internal/data/network1.py', @@ -2455,13 +1927,10 @@ exclude_patterns = [ 'torch/testing/_internal/test_module/__init__.py', 'torch/testing/_internal/test_module/future_div.py', 'torch/testing/_internal/test_module/no_future_div.py', - 'torch/torch_version.py', - 'torch/types.py', 'torch/utils/__init__.py', 'torch/utils/_contextlib.py', 'torch/utils/_cpp_extension_versioner.py', 'torch/utils/_crash_handler.py', - 'torch/utils/_cuda_trace.py', 'torch/utils/_device.py', 'torch/utils/_foreach_utils.py', 'torch/utils/_freeze.py', @@ -2470,7 +1939,6 @@ exclude_patterns = [ 'torch/utils/_stats.py', 'torch/utils/_sympy/__init__.py', 'torch/utils/_sympy/functions.py', - 'torch/utils/_sympy/value_ranges.py', 'torch/utils/_traceback.py', 'torch/utils/_zip.py', 'torch/utils/backcompat/__init__.py', @@ -2590,6 +2058,7 @@ exclude_patterns = [ 'torch/utils/viz/__init__.py', 'torch/utils/viz/_cycles.py', 'torch/utils/weak.py', + 'torch/xpu/_gpu_trace.py', ] init_command = [ 'python3', @@ -2682,7 +2151,7 @@ init_command = [ 'python3', 'tools/linter/adapters/pip_init.py', '--dry-run={{DRYRUN}}', - 'ruff==0.1.11', + 'ruff==0.4.1', ] is_formatter = true @@ -2699,3 +2168,92 @@ command = [ '@{{PATHSFILE}}' ] is_formatter = true + +[[linter]] +code = 'ATEN_CPU_GPU_AGNOSTIC' +include_patterns = [ + # aten source + "aten/src/ATen/*.cpp", + "aten/src/ATen/cpu/*.cpp", + "aten/src/ATen/functorch/**/*.cpp", + "aten/src/ATen/nnapi/*.cpp", + "aten/src/ATen/quantized/*.cpp", + "aten/src/ATen/vulkan/*.cpp", + "aten/src/ATen/metal/*.cpp", + "aten/src/ATen/detail/CPUGuardImpl.cpp", + "aten/src/ATen/detail/MetaGuardImpl.cpp", + # aten native source + "aten/src/ATen/native/cpu/*.cpp", + "aten/src/ATen/native/ao_sparse/cpu/kernels/*.cpp", + "aten/src/ATen/native/ao_sparse/quantized/cpu/kernels/*.cpp", + 
"aten/src/ATen/native/quantized/cpu/kernels/*.cpp", + "aten/src/ATen/native/*.cpp", + "aten/src/ATen/native/cpu/**/*.cpp", + "aten/src/ATen/native/ao_sparse/*.cpp", + "aten/src/ATen/native/ao_sparse/**/*.cpp", + "aten/src/ATen/native/ao_sparse/quantized/*.cpp", + "aten/src/ATen/native/ao_sparse/quantized/**/*.cpp", + "aten/src/ATen/native/nested/*.cpp", + "aten/src/ATen/native/quantized/*.cpp", + "aten/src/ATen/native/quantized/**/*.cpp", + "aten/src/ATen/native/sparse/*.cpp", + "aten/src/ATen/native/transformers/*.cpp", + "aten/src/ATen/native/utils/*.cpp", + "aten/src/ATen/native/xnnpack/*.cpp", + "aten/src/ATen/native/metal/MetalPrepackOpRegister.cpp", + # aten headers + "aten/src/ATen/*.h", + "aten/src/ATen/functorch/**/*.h", + "aten/src/ATen/ops/*.h", + "aten/src/ATen/cpu/**/*.h", + "aten/src/ATen/nnapi/*.h", + "aten/src/ATen/quantized/*.h", + "aten/src/ATen/vulkan/*.h", + "aten/src/ATen/metal/*.h", + "aten/src/ATen/mps/*.h", + # aten native headers + "aten/src/ATen/native/*.h", + "aten/src/ATen/native/cpu/**/*.h", + "aten/src/ATen/native/nested/*.h", + "aten/src/ATen/native/sparse/*.h", + "aten/src/ATen/native/ao_sparse/*.h", + "aten/src/ATen/native/ao_sparse/cpu/*.h", + "aten/src/ATen/native/ao_sparse/quantized/*.h", + "aten/src/ATen/native/ao_sparse/quantized/cpu/*.h", + "aten/src/ATen/native/quantized/*.h", + "aten/src/ATen/native/quantized/cpu/*.h", + "aten/src/ATen/native/transformers/*.h", + "aten/src/ATen/native/quantized/cpu/qnnpack/include/*.h", + "aten/src/ATen/native/utils/*.h", + "aten/src/ATen/native/vulkan/ops/*.h", + "aten/src/ATen/native/xnnpack/*.h", + "aten/src/ATen/native/metal/MetalPrepackOpContext.h", + "aten/src/ATen/native/mps/Copy.h", + "aten/src/ATen/native/mkldnn/**/*.h", +] +exclude_patterns = [ + "aten/src/ATen/Context.h", + "aten/src/ATen/Context.cpp", + "aten/src/ATen/DLConvertor.cpp", + "aten/src/ATen/core/Array.h", + "aten/src/ATen/native/quantized/ConvUtils.h", + "aten/src/ATen/native/sparse/SparseBlasImpl.cpp", # triton implementation + "aten/src/ATen/native/transformers/attention.cpp", + "aten/src/ATen/native/**/cudnn/**", # cudnn is cuda specific +] +command = [ + 'python3', + 'tools/linter/adapters/grep_linter.py', + '--pattern=(^#if.*USE_ROCM.*)|(^#if.*USE_CUDA.*)', + '--linter-name=ATEN_CPU', + '--error-name=aten-cpu should be gpu agnostic', + """--error-description=\ + We strongly discourage the compile-time divergence \ + on ATen-CPU code for different GPU code. 
This \ + disallows sharing the same aten-cpu shared object \ + between different GPU backends \ + """, + '--', + '@{{PATHSFILE}}' +] +is_formatter = true diff --git a/BUILD.bazel b/BUILD.bazel index 0afee2d8d71c9..d3084d9ebd447 100644 --- a/BUILD.bazel +++ b/BUILD.bazel @@ -228,6 +228,7 @@ filegroup( [ "aten/src/ATen/cuda/*.cpp", "aten/src/ATen/cuda/detail/*.cpp", + "aten/src/ATen/cuda/tunable/*.cpp", "aten/src/ATen/cudnn/*.cpp", "aten/src/ATen/native/cuda/*.cpp", "aten/src/ATen/native/cuda/linalg/*.cpp", @@ -445,7 +446,6 @@ cu_library( # caffe2 CAFFE2_COPTS = COMMON_COPTS + [ "-Dcaffe2_EXPORTS", - "-DCAFFE2_USE_GLOO", "-DCAFFE2_USE_CUDNN", "-DCAFFE2_BUILD_MAIN_LIB", "-fvisibility-inlines-hidden", @@ -453,22 +453,6 @@ CAFFE2_COPTS = COMMON_COPTS + [ "-fno-trapping-math", ] -filegroup( - name = "caffe2_contrib_srcs", - srcs = [ - "caffe2/contrib/aten/aten_op.cc", - "caffe2/contrib/gloo/allgather_ops.cc", - "caffe2/contrib/gloo/allreduce_ops.cc", - "caffe2/contrib/gloo/barrier_ops.cc", - "caffe2/contrib/gloo/broadcast_ops.cc", - "caffe2/contrib/gloo/common.cc", - "caffe2/contrib/gloo/common_world_ops.cc", - "caffe2/contrib/gloo/context.cc", - "caffe2/contrib/gloo/reduce_scatter_ops.cc", - "caffe2/contrib/gloo/store_handler.cc", - ], -) - filegroup( name = "caffe2_core_srcs", srcs = [ @@ -519,363 +503,6 @@ filegroup( ], ) -filegroup( - name = "caffe2_distributed_srcs", - srcs = [ - "caffe2/distributed/file_store_handler.cc", - "caffe2/distributed/file_store_handler_op.cc", - "caffe2/distributed/store_handler.cc", - "caffe2/distributed/store_ops.cc", - ], -) - -filegroup( - name = "caffe2_ideep_srcs", - srcs = [ - "caffe2/ideep/operators/adam_op.cc", - "caffe2/ideep/operators/channel_shuffle_op.cc", - "caffe2/ideep/operators/concat_split_op.cc", - "caffe2/ideep/operators/conv_op.cc", - "caffe2/ideep/operators/conv_transpose_op.cc", - "caffe2/ideep/operators/dropout_op.cc", - "caffe2/ideep/operators/elementwise_sum_op.cc", - "caffe2/ideep/operators/expand_squeeze_dims_op.cc", - "caffe2/ideep/operators/fully_connected_op.cc", - "caffe2/ideep/operators/local_response_normalization_op.cc", - "caffe2/ideep/operators/momentum_sgd_op.cc", - "caffe2/ideep/operators/operator_fallback_ideep.cc", - "caffe2/ideep/operators/order_switch_ops.cc", - "caffe2/ideep/operators/pool_op.cc", - "caffe2/ideep/operators/quantization/int8_add_op.cc", - "caffe2/ideep/operators/quantization/int8_conv_op.cc", - "caffe2/ideep/operators/quantization/int8_dequantize_op.cc", - "caffe2/ideep/operators/quantization/int8_fully_connected_op.cc", - "caffe2/ideep/operators/quantization/int8_given_tensor_fill_op.cc", - "caffe2/ideep/operators/quantization/int8_pool_op.cc", - "caffe2/ideep/operators/quantization/int8_quantize_op.cc", - "caffe2/ideep/operators/quantization/int8_relu_op.cc", - "caffe2/ideep/operators/queue_ops.cc", - "caffe2/ideep/operators/relu_op.cc", - "caffe2/ideep/operators/reshape_op.cc", - "caffe2/ideep/operators/shape_op.cc", - "caffe2/ideep/operators/sigmoid_op.cc", - "caffe2/ideep/operators/spatial_batch_norm_op.cc", - "caffe2/ideep/operators/transpose_op.cc", - "caffe2/ideep/operators/utility_ops.cc", - "caffe2/ideep/utils/ideep_register.cc", - ], -) - -filegroup( - name = "caffe2_onnx_srcs", - srcs = [ - "caffe2/onnx/backend.cc", - "caffe2/onnx/backend_rep.cc", - "caffe2/onnx/device.cc", - "caffe2/onnx/helper.cc", - "caffe2/onnx/offline_tensor.cc", - "caffe2/onnx/onnx_exporter.cc", - "caffe2/onnx/onnxifi_graph_info.cc", - "caffe2/onnx/onnxifi_init.cc", - ], -) - -filegroup( - name = 
"caffe2_operators_srcs", - srcs = [ - "caffe2/operators/abs_op.cc", - "caffe2/operators/accumulate_op.cc", - "caffe2/operators/accuracy_op.cc", - "caffe2/operators/acos_op.cc", - "caffe2/operators/affine_channel_op.cc", - "caffe2/operators/alias_with_name.cc", - "caffe2/operators/apmeter_op.cc", - "caffe2/operators/arg_ops.cc", - "caffe2/operators/asin_op.cc", - "caffe2/operators/assert_op.cc", - "caffe2/operators/atan_op.cc", - "caffe2/operators/atomic_ops.cc", - "caffe2/operators/batch_box_cox_op.cc", - "caffe2/operators/batch_bucketize_op.cc", - "caffe2/operators/batch_gather_ops.cc", - "caffe2/operators/batch_matmul_op.cc", - "caffe2/operators/batch_moments_op.cc", - "caffe2/operators/batch_permutation_op.cc", - "caffe2/operators/batch_sparse_to_dense_op.cc", - "caffe2/operators/bbox_transform_op.cc", - "caffe2/operators/bisect_percentile_op.cc", - "caffe2/operators/boolean_mask_ops.cc", - "caffe2/operators/boolean_unmask_ops.cc", - "caffe2/operators/box_with_nms_limit_op.cc", - "caffe2/operators/bucketize_op.cc", - "caffe2/operators/byte_weight_dequant_op.cc", - "caffe2/operators/cast_op.cc", - "caffe2/operators/cbrt_op.cc", - "caffe2/operators/cc_bmm_bg_op.cc", - "caffe2/operators/ceil_op.cc", - "caffe2/operators/channel_backprop_stats_op.cc", - "caffe2/operators/channel_shuffle_op.cc", - "caffe2/operators/channel_stats_op.cc", - "caffe2/operators/clip_op.cc", - "caffe2/operators/collect_and_distribute_fpn_rpn_proposals_op.cc", - "caffe2/operators/communicator_op.cc", - "caffe2/operators/concat_split_op.cc", - "caffe2/operators/conditional_op.cc", - "caffe2/operators/conv_gradient_op.cc", - "caffe2/operators/conv_op.cc", - "caffe2/operators/conv_op_eigen.cc", - "caffe2/operators/conv_op_shared.cc", - "caffe2/operators/conv_transpose_gradient_op.cc", - "caffe2/operators/conv_transpose_op.cc", - "caffe2/operators/conv_transpose_op_mobile.cc", - "caffe2/operators/copy_op.cc", - "caffe2/operators/copy_rows_to_tensor_op.cc", - "caffe2/operators/cos_op.cc", - "caffe2/operators/cosh_op.cc", - "caffe2/operators/cosine_embedding_criterion_op.cc", - "caffe2/operators/counter_ops.cc", - "caffe2/operators/crash_op.cc", - "caffe2/operators/create_scope_op.cc", - "caffe2/operators/crf_viterbi_op.cc", - "caffe2/operators/cross_entropy_op.cc", - "caffe2/operators/ctc_beam_search_decoder_op.cc", - "caffe2/operators/ctc_greedy_decoder_op.cc", - "caffe2/operators/cube_op.cc", - "caffe2/operators/data_couple.cc", - "caffe2/operators/dataset_ops.cc", - "caffe2/operators/deform_conv_gradient_op.cc", - "caffe2/operators/deform_conv_op.cc", - "caffe2/operators/dense_vector_to_id_list_op.cc", - "caffe2/operators/distance_op.cc", - "caffe2/operators/do_op.cc", - "caffe2/operators/dropout_op.cc", - "caffe2/operators/elementwise_add_gradient_op.cc", - "caffe2/operators/elementwise_add_op.cc", - "caffe2/operators/elementwise_div_gradient_op.cc", - "caffe2/operators/elementwise_div_op.cc", - "caffe2/operators/elementwise_linear_op.cc", - "caffe2/operators/elementwise_logical_ops.cc", - "caffe2/operators/elementwise_mul_gradient_op.cc", - "caffe2/operators/elementwise_mul_op.cc", - "caffe2/operators/elementwise_ops.cc", - "caffe2/operators/elementwise_ops_schema.cc", - "caffe2/operators/elementwise_ops_utils.cc", - "caffe2/operators/elementwise_sub_gradient_op.cc", - "caffe2/operators/elementwise_sub_op.cc", - "caffe2/operators/elementwise_sum_op.cc", - "caffe2/operators/elu_op.cc", - "caffe2/operators/enforce_finite_op.cc", - "caffe2/operators/ensure_clipped_op.cc", - "caffe2/operators/ensure_cpu_output_op.cc", - 
"caffe2/operators/erf_op.cc", - "caffe2/operators/exp_op.cc", - "caffe2/operators/expand_op.cc", - "caffe2/operators/expand_squeeze_dims_op.cc", - "caffe2/operators/fc_inference.cc", - "caffe2/operators/feature_maps_ops.cc", - "caffe2/operators/feed_blob_op.cc", - "caffe2/operators/filler_op.cc", - "caffe2/operators/find_duplicate_elements_op.cc", - "caffe2/operators/find_op.cc", - "caffe2/operators/flatten_op.cc", - "caffe2/operators/flexible_top_k.cc", - "caffe2/operators/floor_op.cc", - "caffe2/operators/free_op.cc", - "caffe2/operators/fully_connected_op.cc", - "caffe2/operators/fused_rowwise_8bit_conversion_ops.cc", - "caffe2/operators/fused_rowwise_random_quantization_ops.cc", - "caffe2/operators/gather_fused_8bit_rowwise_op.cc", - "caffe2/operators/gather_op.cc", - "caffe2/operators/gather_ranges_to_dense_op.cc", - "caffe2/operators/gelu_op.cc", - "caffe2/operators/generate_proposals_op.cc", - "caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.cc", - "caffe2/operators/given_tensor_fill_op.cc", - "caffe2/operators/glu_op.cc", - "caffe2/operators/group_norm_op.cc", - "caffe2/operators/gru_unit_op.cc", - "caffe2/operators/h_softmax_op.cc", - "caffe2/operators/half_float_ops.cc", - "caffe2/operators/hard_sigmoid_op.cc", - "caffe2/operators/heatmap_max_keypoint_op.cc", - "caffe2/operators/if_op.cc", - "caffe2/operators/im2col_op.cc", - "caffe2/operators/index_hash_ops.cc", - "caffe2/operators/index_ops.cc", - "caffe2/operators/inference_lstm_op.cc", - "caffe2/operators/instance_norm_gradient_op.cc", - "caffe2/operators/instance_norm_op.cc", - "caffe2/operators/integral_image_op.cc", - "caffe2/operators/is_empty_op.cc", - "caffe2/operators/jsd_op.cc", - "caffe2/operators/key_split_ops.cc", - "caffe2/operators/last_n_window_collector.cc", - "caffe2/operators/layer_norm_op.cc", - "caffe2/operators/leaky_relu_op.cc", - "caffe2/operators/length_split_op.cc", - "caffe2/operators/lengths_pad_op.cc", - "caffe2/operators/lengths_reducer_fused_8bit_rowwise_ops.cc", - "caffe2/operators/lengths_reducer_ops.cc", - "caffe2/operators/lengths_reducer_rowwise_8bit_ops.cc", - "caffe2/operators/lengths_tile_op.cc", - "caffe2/operators/lengths_top_k_op.cc", - "caffe2/operators/listwise_l2r_op.cc", - "caffe2/operators/load_save_op.cc", - "caffe2/operators/load_save_op_util.cc", - "caffe2/operators/local_response_normalization_op.cc", - "caffe2/operators/locally_connected_op.cc", - "caffe2/operators/locally_connected_op_util.cc", - "caffe2/operators/log_op.cc", - "caffe2/operators/logit_op.cc", - "caffe2/operators/loss_op.cc", - "caffe2/operators/lp_pool_op.cc", - "caffe2/operators/lpnorm_op.cc", - "caffe2/operators/lstm_unit_op.cc", - "caffe2/operators/map_ops.cc", - "caffe2/operators/margin_ranking_criterion_op.cc", - "caffe2/operators/matmul_op.cc", - "caffe2/operators/mean_op.cc", - "caffe2/operators/merge_id_lists_op.cc", - "caffe2/operators/minmax_gradient_ops.cc", - "caffe2/operators/minmax_ops.cc", - "caffe2/operators/mod_op.cc", - "caffe2/operators/moments_op.cc", - "caffe2/operators/multi_class_accuracy_op.cc", - "caffe2/operators/negate_gradient_op.cc", - "caffe2/operators/negative_op.cc", - "caffe2/operators/ngram_ops.cc", - "caffe2/operators/norm_planar_yuv_op.cc", - "caffe2/operators/normalize_l1_op.cc", - "caffe2/operators/normalize_op.cc", - "caffe2/operators/numpy_tile_op.cc", - "caffe2/operators/one_hot_ops.cc", - "caffe2/operators/onnx_while_op.cc", - "caffe2/operators/order_switch_ops.cc", - "caffe2/operators/pack_rnn_sequence_op.cc", - "caffe2/operators/pack_segments.cc", - 
"caffe2/operators/pad_op.cc", - "caffe2/operators/partition_ops.cc", - "caffe2/operators/percentile_op.cc", - "caffe2/operators/perplexity_op.cc", - "caffe2/operators/piecewise_linear_transform_op.cc", - "caffe2/operators/pool_gradient_op.cc", - "caffe2/operators/pool_op.cc", - "caffe2/operators/pool_op_util.cc", - "caffe2/operators/pow_op.cc", - "caffe2/operators/prelu_op.cc", - "caffe2/operators/prepend_dim_op.cc", - "caffe2/operators/quant_decode_op.cc", - "caffe2/operators/rank_loss_op.cc", - "caffe2/operators/reciprocal_gradient_op.cc", - "caffe2/operators/reciprocal_op.cc", - "caffe2/operators/reduce_front_back_max_ops.cc", - "caffe2/operators/reduce_front_back_mean_ops.cc", - "caffe2/operators/reduce_front_back_sum_ops.cc", - "caffe2/operators/reduce_ops.cc", - "caffe2/operators/reduction_ops.cc", - "caffe2/operators/relu_n_op.cc", - "caffe2/operators/relu_op.cc", - "caffe2/operators/remove_data_blocks_op.cc", - "caffe2/operators/replace_nan_op.cc", - "caffe2/operators/reservoir_sampling.cc", - "caffe2/operators/reshape_op.cc", - "caffe2/operators/resize_3d_op.cc", - "caffe2/operators/resize_op.cc", - "caffe2/operators/reverse_packed_segs_op.cc", - "caffe2/operators/rmac_regions_op.cc", - "caffe2/operators/rnn/recurrent_network_blob_fetcher_op.cc", - "caffe2/operators/rnn/recurrent_network_executor.cc", - "caffe2/operators/rnn/recurrent_network_op.cc", - "caffe2/operators/roi_align_gradient_op.cc", - "caffe2/operators/roi_align_op.cc", - "caffe2/operators/roi_align_rotated_gradient_op.cc", - "caffe2/operators/roi_align_rotated_op.cc", - "caffe2/operators/roi_pool_op.cc", - "caffe2/operators/rowmul_op.cc", - "caffe2/operators/rsqrt_op.cc", - "caffe2/operators/scale_blobs_op.cc", - "caffe2/operators/scale_op.cc", - "caffe2/operators/segment_reduction_op.cc", - "caffe2/operators/selu_op.cc", - "caffe2/operators/sequence_ops.cc", - "caffe2/operators/shape_op.cc", - "caffe2/operators/sigmoid_gradient_op.cc", - "caffe2/operators/sigmoid_op.cc", - "caffe2/operators/sin_op.cc", - "caffe2/operators/sinh_op.cc", - "caffe2/operators/sinusoid_position_encoding_op.cc", - "caffe2/operators/slice_op.cc", - "caffe2/operators/softmax_op.cc", - "caffe2/operators/softmax_utils.cc", - "caffe2/operators/softmax_with_loss_op.cc", - "caffe2/operators/softplus_op.cc", - "caffe2/operators/softsign_op.cc", - "caffe2/operators/space_batch_op.cc", - "caffe2/operators/sparse_dropout_with_replacement_op.cc", - "caffe2/operators/sparse_normalize_op.cc", - "caffe2/operators/sparse_to_dense_mask_op.cc", - "caffe2/operators/sparse_to_dense_op.cc", - "caffe2/operators/spatial_batch_norm_gradient_op.cc", - "caffe2/operators/spatial_batch_norm_op.cc", - "caffe2/operators/spatial_softmax_with_loss_op.cc", - "caffe2/operators/sqr_op.cc", - "caffe2/operators/sqrt_op.cc", - "caffe2/operators/square_root_divide_op.cc", - "caffe2/operators/stats_ops.cc", - "caffe2/operators/stats_put_ops.cc", - "caffe2/operators/stop_gradient.cc", - "caffe2/operators/string_ops.cc", - "caffe2/operators/stump_func_op.cc", - "caffe2/operators/stylizer_ops.cc", - "caffe2/operators/summarize_op.cc", - "caffe2/operators/swish_op.cc", - "caffe2/operators/tan_op.cc", - "caffe2/operators/tanh_gradient_op.cc", - "caffe2/operators/tanh_op.cc", - "caffe2/operators/tensor_protos_db_input.cc", - "caffe2/operators/text_file_reader.cc", - "caffe2/operators/text_file_reader_utils.cc", - "caffe2/operators/thresholded_relu_op.cc", - "caffe2/operators/tile_op.cc", - "caffe2/operators/top_k.cc", - "caffe2/operators/transpose_op.cc", - 
"caffe2/operators/tt_linear_op.cc", - "caffe2/operators/unique_ops.cc", - "caffe2/operators/upsample_op.cc", - "caffe2/operators/utility_ops.cc", - "caffe2/operators/variable_length_sequence_padding.cc", - "caffe2/operators/weighted_multi_sampling_op.cc", - "caffe2/operators/weighted_sample_op.cc", - "caffe2/operators/while_op.cc", - "caffe2/operators/workspace_ops.cc", - "caffe2/operators/zero_gradient_op.cc", - ], -) - -filegroup( - name = "caffe2_opt_srcs", - srcs = [ - "caffe2/opt/annotations.cc", - "caffe2/opt/backend_cutting.cc", - "caffe2/opt/backend_transformer_base.cc", - "caffe2/opt/bound_shape_inferencer.cc", - "caffe2/opt/converter.cc", - "caffe2/opt/dead_code_elim.cc", - "caffe2/opt/device.cc", - "caffe2/opt/distributed.cc", - "caffe2/opt/distributed_converter.cc", - "caffe2/opt/fusion.cc", - "caffe2/opt/mobile.cc", - "caffe2/opt/onnxifi_op.cc", - "caffe2/opt/onnxifi_transformer.cc", - "caffe2/opt/optimize_ideep.cc", - "caffe2/opt/optimizer.cc", - "caffe2/opt/passes.cc", - "caffe2/opt/shape_info.cc", - "caffe2/opt/tvm_transformer.cc", - ], -) - filegroup( name = "caffe2_perfkernels_srcs", srcs = [ @@ -891,70 +518,6 @@ filegroup( ], ) -filegroup( - name = "caffe2_predictor_srcs", - srcs = [ - "caffe2/predictor/emulator/data_filler.cc", - "caffe2/predictor/emulator/data_filler.h", - "caffe2/predictor/predictor.cc", - "caffe2/predictor/predictor_config.cc", - "caffe2/predictor/predictor_utils.cc", - ], -) - -filegroup( - name = "caffe2_quantization_srcs", - srcs = [ - "caffe2/quantization/server/activation_distribution_observer.cc", - "caffe2/quantization/server/batch_matmul_dnnlowp_op.cc", - "caffe2/quantization/server/caffe2_dnnlowp_utils.cc", - "caffe2/quantization/server/channel_shuffle_dnnlowp_op.cc", - "caffe2/quantization/server/concat_dnnlowp_op.cc", - "caffe2/quantization/server/conv_dnnlowp_acc16_op.cc", - "caffe2/quantization/server/conv_dnnlowp_op.cc", - "caffe2/quantization/server/conv_relu_op.cc", - "caffe2/quantization/server/dequantize_dnnlowp_op.cc", - "caffe2/quantization/server/dnnlowp.cc", - "caffe2/quantization/server/dnnlowp_partition.cc", - "caffe2/quantization/server/dynamic_histogram.cc", - "caffe2/quantization/server/elementwise_add_dnnlowp_op.cc", - "caffe2/quantization/server/elementwise_linear_dnnlowp_op.cc", - "caffe2/quantization/server/elementwise_mul_dnnlowp_op.cc", - "caffe2/quantization/server/elementwise_sum_dnnlowp_op.cc", - "caffe2/quantization/server/elementwise_sum_relu_op.cc", - "caffe2/quantization/server/fbgemm_pack_matrix_cache.cc", - "caffe2/quantization/server/fbgemm_pack_op.cc", - "caffe2/quantization/server/fully_connected_dnnlowp_acc16_op.cc", - "caffe2/quantization/server/fully_connected_dnnlowp_op.cc", - "caffe2/quantization/server/fully_connected_fake_lowp_op.cc", - "caffe2/quantization/server/group_norm_dnnlowp_op.cc", - "caffe2/quantization/server/int8_gen_quant_params.cc", - "caffe2/quantization/server/kl_minimization.cc", - "caffe2/quantization/server/lstm_unit_dnnlowp_op.cc", - "caffe2/quantization/server/norm_minimization.cc", - "caffe2/quantization/server/p99.cc", - "caffe2/quantization/server/pool_dnnlowp_op.cc", - "caffe2/quantization/server/quantize_dnnlowp_op.cc", - "caffe2/quantization/server/relu_dnnlowp_op.cc", - "caffe2/quantization/server/sigmoid.cc", - "caffe2/quantization/server/sigmoid_dnnlowp_op.cc", - "caffe2/quantization/server/spatial_batch_norm_dnnlowp_op.cc", - "caffe2/quantization/server/tanh.cc", - "caffe2/quantization/server/tanh_dnnlowp_op.cc", - "caffe2/quantization/server/utility_dnnlowp_ops.cc", - 
], -) - -filegroup( - name = "caffe2_queue_srcs", - srcs = [ - "caffe2/queue/blobs_queue.cc", - "caffe2/queue/blobs_queue_db.cc", - "caffe2/queue/queue_ops.cc", - "caffe2/queue/rebatching_queue.cc", - "caffe2/queue/rebatching_queue_ops.cc", - ], -) filegroup( name = "caffe2_serialize_srcs", @@ -966,36 +529,6 @@ filegroup( ], ) -filegroup( - name = "caffe2_sgd_srcs", - srcs = [ - "caffe2/sgd/adadelta_op.cc", - "caffe2/sgd/adagrad_op.cc", - "caffe2/sgd/adam_op.cc", - "caffe2/sgd/clip_tensor_op.cc", - "caffe2/sgd/ftrl_op.cc", - "caffe2/sgd/gftrl_op.cc", - "caffe2/sgd/iter_op.cc", - "caffe2/sgd/lars_op.cc", - "caffe2/sgd/learning_rate_adaption_op.cc", - "caffe2/sgd/learning_rate_op.cc", - "caffe2/sgd/momentum_sgd_op.cc", - "caffe2/sgd/rmsprop_op.cc", - "caffe2/sgd/wngrad_op.cc", - "caffe2/sgd/yellowfin_op.cc", - ], -) - -filegroup( - name = "caffe2_transforms_srcs", - srcs = [ - "caffe2/transforms/common_subexpression_elimination.cc", - "caffe2/transforms/conv_to_nnpack_transform.cc", - "caffe2/transforms/pattern_net_transform.cc", - "caffe2/transforms/single_op_transform.cc", - ], -) - filegroup( name = "caffe2_utils_srcs", srcs = [ @@ -1020,228 +553,6 @@ filegroup( ], ) -filegroup( - name = "caffe2_cuda_cpp_srcs", - srcs = [ - "caffe2/contrib/aten/aten_op_gpu.cc", - "caffe2/contrib/gloo/allreduce_ops_gpu.cc", - "caffe2/contrib/gloo/broadcast_ops_gpu.cc", - "caffe2/contrib/gloo/common_world_ops_gpu.cc", - "caffe2/core/blob_serialization_gpu.cc", - "caffe2/core/common_cudnn.cc", - "caffe2/core/common_gpu.cc", - "caffe2/core/event_gpu.cc", - "caffe2/db/create_db_op_gpu.cc", - "caffe2/distributed/file_store_handler_op_gpu.cc", - "caffe2/operators/communicator_op_gpu.cc", - "caffe2/operators/concat_split_op_gpu.cc", - "caffe2/operators/conv_op_cache_cudnn.cc", - "caffe2/operators/conv_op_cudnn.cc", - "caffe2/operators/conv_op_gpu.cc", - "caffe2/operators/conv_op_shared_gpu.cc", - "caffe2/operators/conv_transpose_op_cudnn.cc", - "caffe2/operators/conv_transpose_op_gpu.cc", - "caffe2/operators/counter_ops_gpu.cc", - "caffe2/operators/do_op_gpu.cc", - "caffe2/operators/dropout_op_cudnn.cc", - "caffe2/operators/elementwise_add_op_gpu.cc", - "caffe2/operators/elementwise_sub_op_gpu.cc", - "caffe2/operators/elu_op_cudnn.cc", - "caffe2/operators/exp_op_gpu.cc", - "caffe2/operators/expand_op_gpu.cc", - "caffe2/operators/expand_squeeze_dims_op_gpu.cc", - "caffe2/operators/free_op_gpu.cc", - "caffe2/operators/fully_connected_op_gpu.cc", - "caffe2/operators/if_op_gpu.cc", - "caffe2/operators/im2col_op_gpu.cc", - "caffe2/operators/load_save_op_gpu.cc", - "caffe2/operators/local_response_normalization_op_cudnn.cc", - "caffe2/operators/locally_connected_op_gpu.cc", - "caffe2/operators/log_op_gpu.cc", - "caffe2/operators/matmul_op_gpu.cc", - "caffe2/operators/negate_gradient_op_gpu.cc", - "caffe2/operators/negative_op_gpu.cc", - "caffe2/operators/order_switch_ops_cudnn.cc", - "caffe2/operators/order_switch_ops_gpu.cc", - "caffe2/operators/pool_op_cudnn.cc", - "caffe2/operators/prepend_dim_op_gpu.cc", - "caffe2/operators/reshape_op_gpu.cc", - "caffe2/operators/rnn/recurrent_network_blob_fetcher_op_gpu.cc", - "caffe2/operators/rnn/recurrent_network_executor_gpu.cc", - "caffe2/operators/rnn/recurrent_op_cudnn.cc", - "caffe2/operators/scale_op_gpu.cc", - "caffe2/operators/shape_op_gpu.cc", - "caffe2/operators/sigmoid_op_cudnn.cc", - "caffe2/operators/softmax_op_cudnn.cc", - "caffe2/operators/sqr_op_gpu.cc", - "caffe2/operators/sqrt_op_gpu.cc", - "caffe2/operators/stop_gradient_gpu.cc", - 
"caffe2/operators/tanh_op_cudnn.cc", - "caffe2/operators/tensor_protos_db_input_gpu.cc", - "caffe2/operators/transpose_op_cudnn.cc", - "caffe2/operators/while_op_gpu.cc", - "caffe2/operators/zero_gradient_op_gpu.cc", - "caffe2/queue/queue_ops_gpu.cc", - "caffe2/sgd/iter_op_gpu.cc", - "caffe2/sgd/learning_rate_op_gpu.cc", - ], -) - -filegroup( - name = "caffe2_cu_srcs", - srcs = [ - "caffe2/core/context_gpu.cu", - "caffe2/operators/abs_op.cu", - "caffe2/operators/accumulate_op.cu", - "caffe2/operators/accuracy_op.cu", - "caffe2/operators/acos_op.cu", - "caffe2/operators/affine_channel_op.cu", - "caffe2/operators/alias_with_name.cu", - "caffe2/operators/arg_ops.cu", - "caffe2/operators/asin_op.cu", - "caffe2/operators/assert_op.cu", - "caffe2/operators/atan_op.cu", - "caffe2/operators/batch_gather_ops.cu", - "caffe2/operators/batch_matmul_op.cu", - "caffe2/operators/batch_moments_op.cu", - "caffe2/operators/batch_permutation_op.cu", - "caffe2/operators/batch_sparse_to_dense_op.cu", - "caffe2/operators/boolean_mask_ops.cu", - "caffe2/operators/boolean_unmask_ops.cu", - "caffe2/operators/bucketize_op.cu", - "caffe2/operators/cast_op.cu", - "caffe2/operators/cbrt_op.cu", - "caffe2/operators/ceil_op.cu", - "caffe2/operators/channel_backprop_stats_op.cu", - "caffe2/operators/channel_shuffle_op.cu", - "caffe2/operators/channel_stats_op.cu", - "caffe2/operators/channelwise_conv3d_op_cudnn.cu", - "caffe2/operators/clip_op.cu", - "caffe2/operators/copy_op.cu", - "caffe2/operators/cos_op.cu", - "caffe2/operators/cosh_op.cu", - "caffe2/operators/cosine_embedding_criterion_op.cu", - "caffe2/operators/cross_entropy_op.cu", - "caffe2/operators/cube_op.cu", - "caffe2/operators/data_couple_gpu.cu", - "caffe2/operators/deform_conv_op.cu", - "caffe2/operators/depthwise_3x3_conv_op_cudnn.cu", - "caffe2/operators/distance_op.cu", - "caffe2/operators/dropout_op.cu", - "caffe2/operators/elementwise_div_op.cu", - "caffe2/operators/elementwise_linear_op.cu", - "caffe2/operators/elementwise_mul_op.cu", - "caffe2/operators/elementwise_ops.cu", - "caffe2/operators/elu_op.cu", - "caffe2/operators/enforce_finite_op.cu", - "caffe2/operators/ensure_cpu_output_op.cu", - "caffe2/operators/erf_op.cu", - "caffe2/operators/filler_op.cu", - "caffe2/operators/find_op.cu", - "caffe2/operators/floor_op.cu", - "caffe2/operators/gather_op.cu", - "caffe2/operators/gelu_op.cu", - "caffe2/operators/generate_proposals_op.cu", - "caffe2/operators/generate_proposals_op_util_nms_gpu.cu", - "caffe2/operators/given_tensor_byte_string_to_uint8_fill_op.cu", - "caffe2/operators/given_tensor_fill_op.cu", - "caffe2/operators/glu_op.cu", - "caffe2/operators/group_norm_op.cu", - "caffe2/operators/gru_unit_op_gpu.cu", - "caffe2/operators/half_float_ops.cu", - "caffe2/operators/hard_sigmoid_op.cu", - "caffe2/operators/instance_norm_op.cu", - "caffe2/operators/integral_image_op.cu", - "caffe2/operators/layer_norm_op.cu", - "caffe2/operators/leaky_relu_op.cu", - "caffe2/operators/lengths_pad_op.cu", - "caffe2/operators/lengths_tile_op.cu", - "caffe2/operators/local_response_normalization_op.cu", - "caffe2/operators/logit_op.cu", - "caffe2/operators/loss_op.cu", - "caffe2/operators/lp_pool_op.cu", - "caffe2/operators/lstm_unit_op_gpu.cu", - "caffe2/operators/margin_ranking_criterion_op.cu", - "caffe2/operators/max_pool_with_index.cu", - "caffe2/operators/mean_op.cu", - "caffe2/operators/mem_query_op.cu", - "caffe2/operators/minmax_ops.cu", - "caffe2/operators/moments_op.cu", - "caffe2/operators/multi_class_accuracy_op.cu", - 
"caffe2/operators/normalize_ops.cu", - "caffe2/operators/one_hot_ops.cu", - "caffe2/operators/pack_segments.cu", - "caffe2/operators/pad_op_gpu.cu", - "caffe2/operators/perplexity_op.cu", - "caffe2/operators/piecewise_linear_transform_op.cu", - "caffe2/operators/pool_op.cu", - "caffe2/operators/pow_op.cu", - "caffe2/operators/prelu_op.cu", - "caffe2/operators/reciprocal_op.cu", - "caffe2/operators/reduce_front_back_max_ops.cu", - "caffe2/operators/reduce_front_back_sum_mean_ops.cu", - "caffe2/operators/reduce_ops.cu", - "caffe2/operators/reduction_ops.cu", - "caffe2/operators/relu_n_op.cu", - "caffe2/operators/relu_op.cu", - "caffe2/operators/replace_nan_op.cu", - "caffe2/operators/resize_3d_op.cu", - "caffe2/operators/resize_op.cu", - "caffe2/operators/reverse_packed_segs_op.cu", - "caffe2/operators/rmac_regions_op.cu", - "caffe2/operators/rnn/recurrent_network_op_gpu.cu", - "caffe2/operators/roi_align_gradient_op.cu", - "caffe2/operators/roi_align_op.cu", - "caffe2/operators/roi_align_rotated_gradient_op.cu", - "caffe2/operators/roi_align_rotated_op.cu", - "caffe2/operators/roi_pool_op.cu", - "caffe2/operators/rsqrt_op.cu", - "caffe2/operators/scale_blobs_op.cu", - "caffe2/operators/segment_reduction_op_gpu.cu", - "caffe2/operators/selu_op.cu", - "caffe2/operators/sequence_ops.cu", - "caffe2/operators/sigmoid_op.cu", - "caffe2/operators/sin_op.cu", - "caffe2/operators/sinh_op.cu", - "caffe2/operators/slice_op.cu", - "caffe2/operators/softmax_ops.cu", - "caffe2/operators/softplus_op.cu", - "caffe2/operators/softsign_op.cu", - "caffe2/operators/space_batch_op_gpu.cu", - "caffe2/operators/sparse_normalize_op_gpu.cu", - "caffe2/operators/sparse_to_dense_op.cu", - "caffe2/operators/spatial_batch_norm_op.cu", - "caffe2/operators/spatial_batch_norm_op_cudnn.cu", - "caffe2/operators/stump_func_op.cu", - "caffe2/operators/summarize_op.cu", - "caffe2/operators/swish_op.cu", - "caffe2/operators/tan_op.cu", - "caffe2/operators/tanh_op.cu", - "caffe2/operators/thresholded_relu_op.cu", - "caffe2/operators/tile_op.cu", - "caffe2/operators/top_k.cu", - "caffe2/operators/transpose_op.cu", - "caffe2/operators/unique_ops.cu", - "caffe2/operators/upsample_op.cu", - "caffe2/operators/utility_ops.cu", - "caffe2/operators/weighted_sample_op.cu", - "caffe2/sgd/adadelta_op_gpu.cu", - "caffe2/sgd/adagrad_op_gpu.cu", - "caffe2/sgd/adam_op_gpu.cu", - "caffe2/sgd/fp16_momentum_sgd_op.cu", - "caffe2/sgd/fp32_momentum_sgd_op.cu", - "caffe2/sgd/lars_op_gpu.cu", - "caffe2/sgd/momentum_sgd_op_gpu.cu", - "caffe2/sgd/rmsprop_op_gpu.cu", - "caffe2/sgd/yellowfin_op_gpu.cu", - "caffe2/utils/math/broadcast.cu", - "caffe2/utils/math/elementwise.cu", - "caffe2/utils/math/reduce.cu", - "caffe2/utils/math/transpose.cu", - "caffe2/utils/math_gpu.cu", - ], -) - # To achieve finer granularity and make debug easier, caffe2 is split into three libraries: # ATen, caffe2 and caffe2_for_aten_headers. ATen lib group up source codes under # aten/ directory and caffe2 contains most files under `caffe2/` directory. Since the @@ -1270,35 +581,10 @@ cc_library( ], ) -py_binary( - name = "gen_op", - srcs = ["caffe2/contrib/aten/gen_op.py"], - deps = ["//torchgen"], -) - -genrule( - name = "generated_caffe2_aten_op_headers", - srcs = [ - "caffe2/contrib/aten/aten_op_template.h", - "aten/src/ATen/Declarations.yaml", - ], - outs = ["caffe2/caffe2/contrib/aten/gen_aten_op.h"], - cmd = """ - $(location :gen_op) \ - --output_prefix gen_ \ - --install_dir $(@D) \ - --aten_root `dirname $(location aten/src/ATen/Declarations.yaml)`/../.. 
\ - --template_dir `dirname $(location caffe2/contrib/aten/aten_op_template.h)` \ - --yaml_dir `dirname $(location aten/src/ATen/Declarations.yaml)`""", - tools = [":gen_op"], -) - cc_library( name = "caffe2_headers", hdrs = glob( [ - "caffe2/contrib/aten/*.h", - "caffe2/contrib/gloo/*.h", "caffe2/core/*.h", "caffe2/core/nomnigraph/include/nomnigraph/Converters/*.h", "caffe2/core/nomnigraph/include/nomnigraph/Generated/*.h", @@ -1307,25 +593,8 @@ cc_library( "caffe2/core/nomnigraph/include/nomnigraph/Support/*.h", "caffe2/core/nomnigraph/include/nomnigraph/Transformations/*.h", "caffe2/core/nomnigraph/tests/*.h", - "caffe2/db/*.h", - "caffe2/distributed/*.h", - "caffe2/ideep/*.h", - "caffe2/ideep/operators/*.h", - "caffe2/ideep/operators/quantization/*.h", - "caffe2/ideep/utils/*.h", - "caffe2/onnx/*.h", - "caffe2/operators/*.h", - "caffe2/operators/rnn/*.h", - "caffe2/opt/*.h", "caffe2/perfkernels/*.h", - "caffe2/predictor/*.h", - "caffe2/predictor/emulator/*.h", - "caffe2/quantization/server/*.h", - "caffe2/queue/*.h", "caffe2/serialize/*.h", - "caffe2/sgd/*.h", - "caffe2/share/contrib/depthwise/*.h", - "caffe2/transforms/*.h", "caffe2/utils/*.h", "caffe2/utils/math/*.h", "caffe2/utils/threadpool/*.h", @@ -1337,10 +606,9 @@ cc_library( ) + if_cuda(glob([ "caffe2/**/*.cuh", "caffe2/image/*.h", - ])) + [":generated_caffe2_aten_op_headers"], + ])), copts = CAFFE2_COPTS, includes = [ - "caffe2/contrib/aten", "caffe2/core/nomnigraph/include", ], visibility = ["//visibility:public"], @@ -1352,52 +620,12 @@ cc_library( ], ) -cc_library( - name = "caffe2_dnnlowp_avx2_ops", - srcs = [ - "caffe2/quantization/server/elementwise_sum_dnnlowp_op_avx2.cc", - "caffe2/quantization/server/fully_connected_fake_lowp_op_avx2.cc", - "caffe2/quantization/server/group_norm_dnnlowp_op_avx2.cc", - "caffe2/quantization/server/norm_minimization_avx2.cc", - "caffe2/quantization/server/pool_dnnlowp_op_avx2.cc", - "caffe2/quantization/server/relu_dnnlowp_op_avx2.cc", - "caffe2/quantization/server/spatial_batch_norm_dnnlowp_op_avx2.cc", - "caffe2/quantization/server/transpose.cc", - ], - copts = CAFFE2_COPTS + [ - "-mf16c", - "-mavx2", - "-mfma", - "-mxsave", - ], - visibility = ["//visibility:public"], - deps = [ - ":caffe2_headers", - "@fbgemm", - ], - alwayslink = True, -) - cc_library( name = "caffe2", srcs = [ - "caffe2/db/create_db_op.cc", - "caffe2/db/protodb.cc", - "caffe2/share/contrib/depthwise/depthwise3x3_conv_op.cc", - ":caffe2_contrib_srcs", ":caffe2_core_srcs", - ":caffe2_distributed_srcs", - ":caffe2_ideep_srcs", - ":caffe2_onnx_srcs", - ":caffe2_operators_srcs", - ":caffe2_opt_srcs", ":caffe2_perfkernels_srcs", - ":caffe2_predictor_srcs", - ":caffe2_quantization_srcs", - ":caffe2_queue_srcs", ":caffe2_serialize_srcs", - ":caffe2_sgd_srcs", - ":caffe2_transforms_srcs", ":caffe2_utils_srcs", ], copts = CAFFE2_COPTS + ["-mf16c"], @@ -1405,7 +633,6 @@ cc_library( visibility = ["//visibility:public"], deps = [ ":caffe2_core_macros", - ":caffe2_dnnlowp_avx2_ops", ":caffe2_headers", ":caffe2_perfkernels_avx", ":caffe2_perfkernels_avx2", @@ -1418,11 +645,9 @@ cc_library( "@fbgemm//:fbgemm_src_headers", "@fmt", "@foxi", - "@gloo", "@onnx", ] + if_cuda( [ - ":caffe2_cuda_cpp", ":aten_cuda", "@tensorpipe//:tensorpipe_cuda", ], @@ -1434,39 +659,20 @@ cc_library( alwayslink = True, ) -cc_library( - name = "caffe2_cuda_cpp", - srcs = [":caffe2_cuda_cpp_srcs"], - copts = CAFFE2_COPTS, - visibility = ["//visibility:public"], - deps = [ - ":caffe2_cuda", - ":caffe2_headers", - ], - alwayslink = True, -) - cu_library( - 
name = "caffe2_cuda", - # one may think that `quantization_gpu.cu` could be a separate kernel, - # however that leads to de-registration problem that's described in - # https://github.com/pytorch/pytorch/issues/79236 - # To solve it we add it into the `caffe2_cuda`, - # this is also aligned with the CMake build. - srcs = [":caffe2_cu_srcs"] + [ + name = "torch_cuda", + srcs = [ "torch/csrc/distributed/c10d/intra_node_comm.cu", "torch/csrc/distributed/c10d/quantization/quantization_gpu.cu", ], - copts = CAFFE2_COPTS + torch_cuda_half_options, + copts = torch_cuda_half_options, visibility = ["//visibility:public"], deps = [ ":aten", - ":caffe2_headers", "@cuda//:cublas", "@cuda//:curand", "@cudnn", "@eigen", - "@gloo", "@tensorpipe//:tensorpipe_cuda", ], alwayslink = True, @@ -1640,6 +846,7 @@ cc_library( ] + if_cuda([ "@cuda//:nvToolsExt", "@cutlass", + ":torch_cuda", ]), alwayslink = True, ) @@ -1761,12 +968,10 @@ py_library( visibility = ["//visibility:public"], srcs = glob(["torch/**/*.py"], exclude = ["torch/version.py"]) + [":torch/version.py"] + glob(["functorch/**/*.py"]), deps = [ - rules.requirement("future"), rules.requirement("numpy"), rules.requirement("pyyaml"), rules.requirement("requests"), rules.requirement("setuptools"), - rules.requirement("six"), rules.requirement("sympy"), rules.requirement("typing_extensions"), "//torchgen", diff --git a/CITATION.cff b/CITATION.cff index 2bebc947bfb2f..e6de8772cbf21 100644 --- a/CITATION.cff +++ b/CITATION.cff @@ -6,68 +6,111 @@ authors: url: https://pytorch.org preferred-citation: type: conference-paper - title: "PyTorch: An Imperative Style, High-Performance Deep Learning Library" + title: "PyTorch 2: Faster Machine Learning Through Dynamic Python Bytecode Transformation and Graph Compilation" authors: - - family-names: Paszke - given-names: Adam - - family-names: Gross - given-names: Sam - - family-names: Massa - given-names: Francisco - - family-names: Lerer - given-names: Adam - - family-names: Bradbury - given-names: James - - family-names: Chanan - given-names: Gregory - - family-names: Killeen - given-names: Trevor - - family-names: Lin - given-names: Zeming + - family-names: Ansel + given-names: Jason + - family-names: Yang + given-names: Edward + - family-names: He + given-names: Horace - family-names: Gimelshein given-names: Natalia - - family-names: Antiga - given-names: Luca + - family-names: Jain + given-names: Animesh + - family-names: Voznesensky + given-names: Michael + - family-names: Bao + given-names: Bin + - family-names: Bell + given-names: Peter + - family-names: Berard + given-names: David + - family-names: Burovski + given-names: Evgeni + - family-names: Chauhan + given-names: Geeta + - family-names: Chourdia + given-names: Anjali + - family-names: Constable + given-names: Will - family-names: Desmaison given-names: Alban - - family-names: Kopf - given-names: Andreas - - family-names: Yang - given-names: Edward - family-names: DeVito given-names: Zachary - - family-names: Raison - given-names: Martin - - family-names: Tejani - given-names: Alykhan - - family-names: Chilamkurthy - given-names: Sasank - - family-names: Steiner - given-names: Benoit - - family-names: Fang - given-names: Lu - - family-names: Bai - given-names: Junjie + - family-names: Ellison + given-names: Elias + - family-names: Feng + given-names: Will + - family-names: Gong + given-names: Jiong + - family-names: Gschwind + given-names: Michael + - family-names: Hirsh + given-names: Brian + - family-names: Huang + given-names: Sherlock + - 
family-names: Kalambarkar + given-names: Kshiteej + - family-names: Kirsch + given-names: Laurent + - family-names: Lazos + given-names: Michael + - family-names: Lezcano + given-names: Mario + - family-names: Liang + given-names: Yanbo + - family-names: Liang + given-names: Jason + - family-names: Lu + given-names: Yinghai + - family-names: Luk + given-names: CK + - family-names: Maher + given-names: Bert + - family-names: Pan + given-names: Yunjie + - family-names: Puhrsch + given-names: Christian + - family-names: Reso + given-names: Matthias + - family-names: Saroufim + given-names: Mark + - family-names: Siraichi + given-names: Marcos Yukio + - family-names: Suk + given-names: Helen + - family-names: Suo + given-names: Michael + - family-names: Tillet + given-names: Phil + - family-names: Wang + given-names: Eikan + - family-names: Wang + given-names: Xiaodong + - family-names: Wen + given-names: William + - family-names: Zhang + given-names: Shunting + - family-names: Zhao + given-names: Xu + - family-names: Zhou + given-names: Keren + - family-names: Zou + given-names: Richard + - family-names: Mathews + given-names: Ajit + - family-names: Chanan + given-names: Gregory + - family-names: Wu + given-names: Peng - family-names: Chintala given-names: Soumith - collection-title: Advances in Neural Information Processing Systems 32 + collection-title: "29th ACM International Conference on Architectural Support for Programming Languages and Operating Systems, Volume 2 (ASPLOS '24)" collection-type: proceedings - editors: - - family-names: Wallach - given-names: H. - - family-names: Larochelle - given-names: H. - - family-names: Beygelzimer - given-names: A. - - family-names: "d'AlchĆ©-Buc" - given-names: F. - - family-names: Fox - given-names: E. - - family-names: Garnett - given-names: R. - start: 8024 - end: 8035 - year: 2019 + month: 4 + year: 2024 publisher: - name: Curran Associates, Inc. - url: http://papers.neurips.cc/paper/9015-pytorch-an-imperative-style-high-performance-deep-learning-library.pdf + name: ACM + doi: "10.1145/3620665.3640366" + url: "https://pytorch.org/assets/pytorch2-2.pdf" diff --git a/CMakeLists.txt index ba7c20e434fd6..79db67e7357b5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -19,6 +19,12 @@ cmake_policy(SET CMP0069 NEW) # nice when it's possible, and it's possible on our Windows configs. cmake_policy(SET CMP0092 NEW) +# Prohibit in-source builds +if(${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR}) +message(FATAL_ERROR "In-source builds are not supported") +endif() + + # ---[ Project and semantic versioning.
project(Torch CXX C) @@ -43,11 +49,15 @@ set(CMAKE_C_STANDARD 11 CACHE STRING "The C standard whose features are reques # ---[ Utils include(cmake/public/utils.cmake) -# --- [ Check that minimal gcc version is 9.4+ -if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.4) - message(FATAL "GCC-9.4 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}") +# --- [ Check that minimal gcc version is 9.3+ +if(CMAKE_COMPILER_IS_GNUCXX AND CMAKE_CXX_COMPILER_VERSION VERSION_LESS 9.3) + message(FATAL_ERROR "GCC-9.3 or newer is required to compile PyTorch, but found ${CMAKE_CXX_COMPILER_VERSION}") endif() +# This define is needed to preserve behavior given anticipated changes to cccl/thrust +# https://nvidia.github.io/libcudacxx/standard_api/numerics_library/complex.html +string(APPEND CMAKE_CUDA_FLAGS " -DLIBCUDACXX_ENABLE_SIMPLIFIED_COMPLEX_OPERATIONS") + if(LINUX) include(cmake/CheckAbi.cmake) string(APPEND CMAKE_CXX_FLAGS " -D_GLIBCXX_USE_CXX11_ABI=${GLIBCXX_USE_CXX11_ABI}") @@ -198,6 +208,9 @@ option(USE_COLORIZE_OUTPUT "Colorize output during compilation" ON) option(USE_ASAN "Use Address+Undefined Sanitizers" OFF) option(USE_TSAN "Use Thread Sanitizer" OFF) option(USE_CUDA "Use CUDA" ON) +cmake_dependent_option( + USE_XPU "Use XPU. Only available on Linux." ON + "LINUX" OFF) cmake_dependent_option( BUILD_LAZY_CUDA_LINALG "Build cuda linalg ops as separate library" ON "USE_CUDA AND LINUX AND BUILD_PYTHON" OFF) cmake_dependent_option(USE_ROCM "Use ROCm" ON "LINUX" OFF) @@ -215,14 +228,10 @@ option(USE_FBGEMM "Use FBGEMM (quantized 8-bit server operators)" ON) option(USE_KINETO "Use Kineto profiling library" ON) option(USE_CUPTI_SO "Use CUPTI as a shared library" ON) option(USE_FAKELOWP "Use FakeLowp operators" OFF) -option(USE_FFMPEG "Use ffmpeg" OFF) option(USE_GFLAGS "Use GFLAGS" OFF) option(USE_GLOG "Use GLOG" OFF) -option(USE_LEVELDB "Use LEVELDB" OFF) option(USE_LITE_PROTO "Use lite protobuf instead of full." OFF) -option(USE_LMDB "Use LMDB" OFF) option(USE_MAGMA "Use MAGMA" ON) -option(USE_METAL "Use Metal for Caffe2 iOS build" ON) option(USE_PYTORCH_METAL "Use Metal for PyTorch iOS build" OFF) option(USE_PYTORCH_METAL_EXPORT "Export Metal models on MacOSX desktop" OFF) option(USE_NATIVE_ARCH "Use -march=native" OFF) @@ -251,15 +260,12 @@ cmake_dependent_option( option(USE_NUMPY "Use NumPy" ON) option(USE_OBSERVERS "Use observers module." OFF) option(USE_OPENCL "Use OpenCL" OFF) -option(USE_OPENCV "Use OpenCV" OFF) option(USE_OPENMP "Use OpenMP for parallel code" ON) option(USE_PRECOMPILED_HEADERS "Use pre-compiled headers to accelerate build." OFF) option(USE_PROF "Use profiling" OFF) option(USE_QNNPACK "Use QNNPACK (quantized 8-bit operators)" ON) option(USE_PYTORCH_QNNPACK "Use ATen/QNNPACK (quantized 8-bit operators)" ON) -option(USE_REDIS "Use Redis" OFF) -option(USE_ROCKSDB "Use RocksDB" OFF) option(USE_SNPE "Use Qualcomm's SNPE library" OFF) option(USE_SYSTEM_EIGEN_INSTALL "Use system Eigen instead of the one under third_party" OFF) @@ -281,7 +287,6 @@ option(USE_VULKAN_FP16_INFERENCE "Vulkan - Use fp16 inference" OFF) option(USE_VULKAN_RELAXED_PRECISION "Vulkan - Use relaxed precision math in the kernels (mediump)" OFF) # option USE_XNNPACK: try to enable xnnpack by default.
option(USE_XNNPACK "Use XNNPACK" ON) -option(USE_ZMQ "Use ZMQ" OFF) option(USE_ZSTD "Use ZSTD" OFF) option(USE_ROCM_KERNEL_ASSERT "Use Kernel Assert for ROCm" OFF) # Ensure that an ITT build is the default for x86 CPUs @@ -344,6 +349,8 @@ cmake_dependent_option( "NOT INTERN_BUILD_MOBILE" OFF) cmake_dependent_option( BUILD_FUNCTORCH "Build Functorch" ON "BUILD_PYTHON" OFF) +cmake_dependent_option( + BUILD_BUNDLE_PTXAS "Bundle PTX into torch/bin folder" OFF "USE_CUDA" OFF) option(USE_MIMALLOC "Use mimalloc" OFF) # Enable third party mimalloc library to improve memory allocation performance on Windows. @@ -727,13 +734,34 @@ if(MSVC) append_cxx_flag_if_supported("/utf-8" CMAKE_CXX_FLAGS) endif() -# CAVEAT: do NOT check USE_ROCM here, because USE_ROCM is always True until -# include(cmake/Dependencies.cmake) +# Note for ROCM platform: +# 1. USE_ROCM is always ON until include(cmake/Dependencies.cmake) +# 2. USE_CUDA will become OFF during re-configuration +# Truth Table: +# CUDA 1st pass: USE_CUDA=True;USE_ROCM=True, FLASH evaluates to ON by default +# CUDA 2nd pass: USE_CUDA=True;USE_ROCM=False, FLASH evaluates to ON by default +# ROCM 1st pass: USE_CUDA=True;USE_ROCM=True, FLASH evaluates to ON by default +# ROCM 2nd pass: USE_CUDA=False;USE_ROCM=True, FLASH evaluates to ON by default +# CPU 1st pass: USE_CUDA=False(Cmd Option);USE_ROCM=True, FLASH evaluates to OFF by default +# CPU 2nd pass: USE_CUDA=False(Cmd Option);USE_ROCM=False, FLASH evaluates to OFF by default +# Thus we cannot tell ROCM 2nd pass and CPU 1st pass apart +# +# The only solution is to include(cmake/Dependencies.cmake), and defer the +# aotriton build decision later. + +include(cmake/Dependencies.cmake) + cmake_dependent_option( USE_FLASH_ATTENTION "Whether to build the flash_attention kernel for scaled dot product attention.\ Will be disabled if not supported by the platform" ON - "USE_CUDA AND NOT MSVC" OFF) + "USE_CUDA OR USE_ROCM;NOT MSVC" OFF) + +# We are currently not using alibi attention for Flash +# So we disable this feature by default +# We don't currently document this feature because we don't +# suspect users building from source will need this +add_definitions(-DFLASHATTENTION_DISABLE_ALIBI) # CAVEAT: Again, do not check USE_ROCM here # Flash Attention2 will error while building for sm52 while Mem Eff Attention won't @@ -743,8 +771,6 @@ cmake_dependent_option( Will be disabled if not supported by the platform" ON "USE_CUDA" OFF) -include(cmake/Dependencies.cmake) - if(DEBUG_CUDA) string(APPEND CMAKE_CUDA_FLAGS_DEBUG " -lineinfo") string(APPEND CMAKE_CUDA_FLAGS_RELWITHDEBINFO " -lineinfo") @@ -1146,6 +1172,7 @@ if(BUILD_SHARED_LIBS) COMPONENT dev) install(FILES ${PROJECT_SOURCE_DIR}/cmake/public/cuda.cmake + ${PROJECT_SOURCE_DIR}/cmake/public/xpu.cmake ${PROJECT_SOURCE_DIR}/cmake/public/glog.cmake ${PROJECT_SOURCE_DIR}/cmake/public/gflags.cmake ${PROJECT_SOURCE_DIR}/cmake/public/mkl.cmake @@ -1167,6 +1194,10 @@ if(BUILD_SHARED_LIBS) ${PROJECT_SOURCE_DIR}/cmake/Modules/FindCUSPARSELT.cmake DESTINATION share/cmake/Caffe2/ COMPONENT dev) + install(FILES + ${PROJECT_SOURCE_DIR}/cmake/Modules/FindSYCLToolkit.cmake + DESTINATION share/cmake/Caffe2/ + COMPONENT dev) install(EXPORT Caffe2Targets DESTINATION share/cmake/Caffe2 FILE Caffe2Targets.cmake @@ -1229,3 +1260,12 @@ if(DEFINED USE_CUSTOM_DEBINFO) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -g") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -g") endif() + +# Bundle PTXAS if needed +if(BUILD_BUNDLE_PTXAS AND USE_CUDA) + if(NOT EXISTS
"${PROJECT_SOURCE_DIR}/build/bin/ptxas") + message(STATUS "Copying PTXAS into the bin folder") + file(COPY "${CUDAToolkit_BIN_DIR}/ptxas" DESTINATION "${PROJECT_BINARY_DIR}") + endif() + install(PROGRAMS "${PROJECT_BINARY_DIR}/ptxas" DESTINATION "${CMAKE_INSTALL_BINDIR}") +endif() diff --git a/CODEOWNERS b/CODEOWNERS index a07f5f81a2a51..e481e6611279a 100644 --- a/CODEOWNERS +++ b/CODEOWNERS @@ -43,12 +43,12 @@ nn/qat/ @jerryzh168 /torch/csrc/distributed/rpc/tensorpipe_agent.h @jiayisuse @osalpekar @lw # ONNX Export -/torch/_dynamo/backends/onnxrt.py @bowenbao @abock @thiagocrepaldi @wschin -/torch/csrc/jit/passes/onnx.h @bowenbao @abock @thiagocrepaldi -/torch/csrc/jit/passes/onnx.cpp @bowenbao @abock @thiagocrepaldi -/torch/csrc/jit/passes/onnx/ @bowenbao @abock @thiagocrepaldi -/torch/onnx/ @bowenbao @abock @thiagocrepaldi @wschin -/test/onnx/ @bowenbao @abock @thiagocrepaldi @wschin +/torch/_dynamo/backends/onnxrt.py @bowenbao @thiagocrepaldi @wschin +/torch/csrc/jit/passes/onnx.h @bowenbao @thiagocrepaldi +/torch/csrc/jit/passes/onnx.cpp @bowenbao @thiagocrepaldi +/torch/csrc/jit/passes/onnx/ @bowenbao @thiagocrepaldi +/torch/onnx/ @bowenbao @thiagocrepaldi @wschin +/test/onnx/ @bowenbao @thiagocrepaldi @wschin # CI /.ci @pytorch/pytorch-dev-infra @@ -67,6 +67,7 @@ nn/qat/ @jerryzh168 /test/run_test.py @pytorch/pytorch-dev-infra /torch/testing/_internal/common_device_type.py @mruberry /torch/testing/_internal/common_utils.py @pytorch/pytorch-dev-infra +/torch/testing/_internal/hop_db.py @tugsbayasgalan @zou3519 @ydwu4 # Parametrizations /torch/nn/utils/parametriz*.py @lezcano @@ -97,9 +98,13 @@ test/functorch/test_ops.py @zou3519 @chillee @kshitij12345 test/functorch/test_vmap.py @zou3519 @chillee @kshitij12345 # torch MPS -test/test_mps.py @kulinseth -aten/src/ATen/mps/ @kulinseth -aten/src/ATen/native/mps/ @kulinseth +test/test_mps.py @kulinseth @malfet +aten/src/ATen/mps/ @kulinseth @malfet +aten/src/ATen/native/mps/ @kulinseth @malfet + +# MTIA +aten/src/ATen/detail/MTIAHooksInterface.h @egienvalue +torch/csrc/mtia/ @egienvalue # Profiler torch/csrc/autograd/profiler* @aaronenyeshi @@ -111,7 +116,7 @@ torch/profiler/ @aaronenyeshi test/functorch/test_aotdispatch.py @ezyang @Chillee # Dataloader -torch/utils/data/ @ejguan +torch/utils/data/ @andrewkho @gokulavasan # hipify torch/utils/hipify/ @jeffdaily @jithunnair-amd @@ -130,3 +135,23 @@ caffe2/utils/hip @jeffdaily @jithunnair-amd # torch.export /torch/export/ @avikchaudhuri @gmagogsfm @tugsbayasgalan @zhxchen17 /torch/_export/ @avikchaudhuri @gmagogsfm @tugsbayasgalan @zhxchen17 + +# serialization-related files +/aten/src/ATen/MapAllocator* @mikaylagawarecki +/caffe2/serialize/ @mikaylagawarecki +/torch/serialization.py @mikaylagawarecki +/torch/storage.py @mikaylagawarecki +/torch/csrc/Storage* @mikaylagawarecki +# subscribing for PyTorchFileWriter/PyTorchFileReader changes +/torch/csrc/jit/python/init.cpp @mikaylagawarecki + +# CUDA and CUDA math libraries +aten/src/ATen/cuda/ @eqy +aten/src/ATen/cudnn/ @eqy +aten/src/ATen/native/cuda/ @eqy +aten/src/ATen/native/cudnn/ @eqy +c10/cuda @eqy +torch/cuda/ @eqy +torch/csrc/cuda/ @eqy +torch/backends/cuda/ @eqy +torch/backends/cudnn/ @eqy diff --git a/Dockerfile b/Dockerfile index a2c4bef96598c..b8ff65fdd1e9a 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,14 +1,12 @@ -# syntax = docker/dockerfile:experimental -# -# NOTE: To build this you will need a docker version > 18.06 with -# experimental enabled and DOCKER_BUILDKIT=1 -# -# If you do not use buildkit you are not going to have a 
good time +# syntax=docker/dockerfile:1 + +# NOTE: Building this image requires docker version >= 23.0. # -# For reference: -# https://docs.docker.com/develop/develop-images/build_enhancements/ -ARG BASE_IMAGE=ubuntu:20.04 -ARG PYTHON_VERSION=3.8 +# For reference: +# - https://docs.docker.com/build/dockerfile/frontend/#stable-channel + +ARG BASE_IMAGE=ubuntu:22.04 +ARG PYTHON_VERSION=3.11 FROM ${BASE_IMAGE} as dev-base RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends \ @@ -26,7 +24,7 @@ RUN mkdir /opt/ccache && ccache --set-config=cache_dir=/opt/ccache ENV PATH /opt/conda/bin:$PATH FROM dev-base as conda -ARG PYTHON_VERSION=3.8 +ARG PYTHON_VERSION=3.11 # Automatically set by buildx ARG TARGETPLATFORM # translating Docker's TARGETPLATFORM into miniconda arches @@ -57,12 +55,12 @@ COPY --from=submodule-update /opt/pytorch /opt/pytorch RUN make triton RUN --mount=type=cache,target=/opt/ccache \ export eval ${CMAKE_VARS} && \ - TORCH_CUDA_ARCH_LIST="3.5 5.2 6.0 6.1 7.0+PTX 8.0" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ + TORCH_CUDA_ARCH_LIST="7.0 7.2 7.5 8.0 8.6 8.7 8.9 9.0 9.0a" TORCH_NVCC_FLAGS="-Xfatbin -compress-all" \ CMAKE_PREFIX_PATH="$(dirname $(which conda))/../" \ python setup.py install FROM conda as conda-installs -ARG PYTHON_VERSION=3.8 +ARG PYTHON_VERSION=3.11 ARG CUDA_VERSION=12.1 ARG CUDA_CHANNEL=nvidia ARG INSTALL_CHANNEL=pytorch-nightly @@ -99,6 +97,7 @@ ENV PATH /opt/conda/bin:$PATH ENV NVIDIA_VISIBLE_DEVICES all ENV NVIDIA_DRIVER_CAPABILITIES compute,utility ENV LD_LIBRARY_PATH /usr/local/nvidia/lib:/usr/local/nvidia/lib64 +ENV PATH /usr/local/nvidia/bin:/usr/local/cuda/bin:$PATH ENV PYTORCH_VERSION ${PYTORCH_VERSION} WORKDIR /workspace diff --git a/README.md index ae3e1330c02e9..3ff42586109c3 100644 --- a/README.md +++ b/README.md @@ -158,16 +158,16 @@ They require JetPack 4.2 and above, and [@dusty-nv](https://github.com/dusty-nv) #### Prerequisites If you are installing from source, you will need: - Python 3.8 or later (for Linux, Python 3.8.1+ is needed) -- A compiler that fully supports C++17, such as clang or gcc (especially for aarch64, gcc 9.4.0 or newer is required) +- A compiler that fully supports C++17, such as clang or gcc (gcc 9.4.0 or newer is required) We highly recommend installing an [Anaconda](https://www.anaconda.com/download) environment. You will get a high-quality BLAS library (MKL) and you get controlled dependency versions regardless of your Linux distro. If you want to compile with CUDA support, [select a supported version of CUDA from our support matrix](https://pytorch.org/get-started/locally/), then install the following: - [NVIDIA CUDA](https://developer.nvidia.com/cuda-downloads) -- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v7 or above +- [NVIDIA cuDNN](https://developer.nvidia.com/cudnn) v8.5 or above - [Compiler](https://gist.github.com/ax3l/9489132) compatible with CUDA -Note: You could refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/pdf/cuDNN-Support-Matrix.pdf) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware +Note: You can refer to the [cuDNN Support Matrix](https://docs.nvidia.com/deeplearning/cudnn/reference/support-matrix.html) for cuDNN versions with the various supported CUDA, CUDA driver and NVIDIA hardware If you want to disable CUDA support, export the environment variable `USE_CUDA=0`. Other potentially useful environment variables may be found in `setup.py`.
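Once a source build (or the image above) finishes, a quick way to confirm what the resulting `torch` package was actually compiled with is to query it directly. This is only an illustrative sanity check, not part of the build or the PR; it assumes nothing beyond a standard `torch` installation:

```python
# Minimal sketch: report what this torch build was compiled with.
import torch

print("torch version:", torch.__version__)
print("built with CUDA:", torch.version.cuda)            # None for CPU-only builds (e.g. USE_CUDA=0)
print("cuDNN version:", torch.backends.cudnn.version())  # None when cuDNN support is absent
print("CUDA available at runtime:", torch.cuda.is_available())
if torch.cuda.is_available():
    # GPU architectures the binary was built for (cf. TORCH_CUDA_ARCH_LIST in the Dockerfile above)
    print("compiled arch list:", torch.cuda.get_arch_list())
```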
diff --git a/RELEASE.md index aeb98825c592b..cfb1a089a02fe 100644 --- a/RELEASE.md +++ b/RELEASE.md @@ -17,6 +17,7 @@ - [Release Candidate Storage](#release-candidate-storage) - [Release Candidate health validation](#release-candidate-health-validation) - [Cherry Picking Fixes](#cherry-picking-fixes) + - [How to do Cherry Picking](#how-to-do-cherry-picking) - [Cherry Picking Reverts](#cherry-picking-reverts) - [Preparing and Creating Final Release candidate](#preparing-and-creating-final-release-candidate) - [Promoting RCs to Stable](#promoting-rcs-to-stable) @@ -49,7 +50,8 @@ Following is the Release Compatibility Matrix for PyTorch releases: | PyTorch version | Python | Stable CUDA | Experimental CUDA | | --- | --- | --- | --- | -| 2.2 | >=3.8, <=3.11 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | +| 2.3 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | +| 2.2 | >=3.8, <=3.11, (3.12 experimental) | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | | 2.1 | >=3.8, <=3.11 | CUDA 11.8, CUDNN 8.7.0.84 | CUDA 12.1, CUDNN 8.9.2.26 | | 2.0 | >=3.8, <=3.11 | CUDA 11.7, CUDNN 8.5.0.96 | CUDA 11.8, CUDNN 8.7.0.84 | | 1.13 | >=3.7, <=3.10 | CUDA 11.6, CUDNN 8.3.2.44 | CUDA 11.7, CUDNN 8.5.0.96 | @@ -217,7 +219,7 @@ Validate the release jobs for pytorch and domain libraries should be green. Vali * [TorchVision](https://hud.pytorch.org/hud/pytorch/vision/release%2F1.12) * [TorchAudio](https://hud.pytorch.org/hud/pytorch/audio/release%2F1.12) -Validate that the documentation build has completed and generated entry corresponding to the release in [docs folder](https://github.com/pytorch/pytorch.github.io/tree/site/docs/) of pytorch.github.io repository +Validate that the documentation build has completed and generated an entry corresponding to the release in the [docs repository](https://github.com/pytorch/docs/tree/main/) ### Cherry Picking Fixes @@ -232,6 +234,32 @@ Please also make sure to add milestone target to the PR/issue, especially if it **NOTE**: The cherry pick process is not an invitation to add new features, it is mainly there to fix regressions +#### How to do Cherry Picking + +You can now use `pytorchbot` to cherry pick a PyTorch PR that has been committed +to the main branch using the `@pytorchbot cherry-pick` command as follows. + +``` +usage: @pytorchbot cherry-pick --onto ONTO [--fixes FIXES] -c + {regression,critical,fixnewfeature,docs,release} + +Cherry pick a pull request onto a release branch for inclusion in a release + +optional arguments: + --onto ONTO Branch you would like to cherry pick onto (Example: release/2.2) + --fixes FIXES Link to the issue that your PR fixes (i.e. https://github.com/pytorch/pytorch/issues/110666) + -c {regression,critical,fixnewfeature,docs,release} + A machine-friendly classification of the cherry-pick reason. +``` + +For example, [#120567](https://github.com/pytorch/pytorch/pull/120567#issuecomment-1978964376) +created a cherry pick PR [#121232](https://github.com/pytorch/pytorch/pull/121232) onto `release/2.2` +branch to fix a regression issue. You can then refer to the original +and the cherry-picked PRs on the release tracker issue. Please note +that the cherry-picked PR will still need to be reviewed by the PyTorch +RelEng team before it can go into the release branch. This feature +requires `pytorchbot`, so it's only available in PyTorch at the moment. An illustrative bot comment is sketched after this section. + ### Cherry Picking Reverts If PR that has been cherry-picked into release branch has been reverted, it's cherry-pick must be reverted as well.
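For reference, the pieces described in the RELEASE.md section above combine into a single comment posted on the already-merged PR. The branch, classification, and issue link below are illustrative, reusing the values from the usage text rather than referring to any particular PR:

```
@pytorchbot cherry-pick --onto release/2.2 -c regression --fixes https://github.com/pytorch/pytorch/issues/110666
```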
diff --git a/SECURITY.md b/SECURITY.md index 0651f82b70c6e..e8e0249fc896c 100644 --- a/SECURITY.md +++ b/SECURITY.md @@ -1,9 +1,56 @@ -# Reporting Security Issues +# Security Policy -If you believe you have found a security vulnerability in PyTorch, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. + - [**Reporting a Vulnerability**](#reporting-a-vulnerability) + - [**Using Pytorch Securely**](#using-pytorch-securely) + - [Untrusted models](#untrusted-models) + - [Untrusted inputs](#untrusted-inputs) + - [Data privacy](#data-privacy) + +## Reporting Security Issues + +Beware that none of the topics under [Using Pytorch Securely](#using-pytorch-securely) are considered vulnerabilities of Pytorch. + +However, if you believe you have found a security vulnerability in PyTorch, we encourage you to let us know right away. We will investigate all legitimate reports and do our best to quickly fix the problem. Please report security issues using https://github.com/pytorch/pytorch/security/advisories/new Please refer to the following page for our responsible disclosure policy, reward guidelines, and those things that should not be reported: https://www.facebook.com/whitehat + + +## Using Pytorch Securely +**Pytorch models are programs**, so treat their security seriously -- running untrusted models is equivalent to running untrusted code. In general we recommend that model weights and the python code for the model are distributed independently. That said, be careful about where you get the python code from and who wrote it (preferentially check for provenance or checksums; do not blindly run any pip-installed package). + +### Untrusted models +Be careful when running untrusted models. This classification includes models created by unknown developers or utilizing data obtained from unknown sources[^data-poisoning-sources]. + +**Prefer to execute untrusted models within a secure, isolated environment such as a sandbox** (e.g., containers, virtual machines). This helps protect your system from potentially malicious code. You can find further details and instructions on [this page](https://developers.google.com/code-sandboxing). + +**Be mindful of risky model formats**. Prefer to share and load weights in the format most appropriate for your use case. [safetensors](https://huggingface.co/docs/safetensors/en/index) gives the most safety but is the most restricted in what it supports. [`torch.load`](https://pytorch.org/docs/stable/generated/torch.load.html#torch.load) with `weights_only=True` is also secure to our knowledge, even though it offers a significantly larger attack surface. Loading an untrusted checkpoint with `weights_only=False` MUST never be done. + + + +Important Note: The trustworthiness of a model is not binary. You must always determine the proper level of caution depending on the specific model and how it matches your use case and risk tolerance. + +[^data-poisoning-sources]: To understand the risks of using data from unknown sources, read the following papers on data poisoning: + https://arxiv.org/abs/2312.04748 + https://arxiv.org/abs/2401.05566 + +### Untrusted inputs during training and prediction + +If you plan to open your model to untrusted inputs, be aware that inputs can also be used as vectors by malicious agents. To minimize risks, make sure to give your model only the permissions strictly required, and keep your libraries updated with the latest security patches.
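To make the model-format guidance under "Untrusted models" above concrete, here is a minimal, hypothetical sketch (not part of the diff itself) of loading third-party weights defensively with `torch.load(..., weights_only=True)`; the checkpoint path and the tiny `nn.Linear` module are placeholders for illustration.

```python
import torch
import torch.nn as nn

# Hypothetical third-party checkpoint; the path is a placeholder.
CHECKPOINT = "untrusted_checkpoint.pt"

# weights_only=True uses a restricted unpickler that only accepts tensors
# and plain containers, rejecting objects that could execute code on load.
state_dict = torch.load(CHECKPOINT, weights_only=True)

# NEVER pass weights_only=False for files from unknown sources: full
# pickle deserialization can run arbitrary code.

# The model definition itself should come from code you trust and review.
model = nn.Linear(16, 4)
model.load_state_dict(state_dict)
```

If the weights are shipped in safetensors format instead, `safetensors.torch.load_file` avoids pickle entirely.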
+ +If applicable, prepare your model against bad inputs and prompt injections. Some recommendations: +- Pre-analysis: check how the model performs by default when exposed to prompt injection (e.g. using fuzzing for prompt injection). +- Input Sanitization: Before feeding data to the model, sanitize inputs rigorously. This involves techniques such as: + - Validation: Enforce strict rules on allowed characters and data types. + - Filtering: Remove potentially malicious scripts or code fragments. + - Encoding: Convert special characters into safe representations. + - Verification: Run tooling that identifies potential script injections (e.g. [models that detect prompt injection attempts](https://python.langchain.com/docs/guides/safety/hugging_face_prompt_injection)). + +### Data privacy + +**Take special security measures if you train models with sensitive data**. Prioritize [sandboxing](https://developers.google.com/code-sandboxing) your models and: +- Do not feed sensitive data to an untrusted model (even if it runs in a sandboxed environment) +- If you consider publishing a model that was partially trained with sensitive data, be aware that data can potentially be recovered from the trained weights (especially if the model overfits). diff --git a/WORKSPACE b/WORKSPACE index b187949d663e7..8eabea571a571 100644 --- a/WORKSPACE +++ b/WORKSPACE @@ -71,6 +71,13 @@ http_archive( ], ) +http_archive( + name = "com_github_opentelemetry-cpp", + urls = [ + "https://github.com/open-telemetry/opentelemetry-cpp/archive/refs/tags/v1.14.2.tar.gz", + ], +) + new_local_repository( name = "gloo", build_file = "//third_party:gloo.BUILD", @@ -155,6 +162,12 @@ new_local_repository( path = "third_party/kineto", ) +new_local_repository( + name = "opentelemetry-cpp", + build_file = "//third_party::opentelemetry-cpp.BUILD", + path = "third_party/opentelemetry-cpp", +) + new_patched_local_repository( name = "tbb", build_file = "//third_party:tbb.BUILD", @@ -206,11 +219,11 @@ py_repositories() load("@rules_python//python:repositories.bzl", "python_register_toolchains") python_register_toolchains( - name = "python3_8", - python_version = "3.8", + name = "python3_10", + python_version = "3.10", ) -load("@python3_8//:defs.bzl", "interpreter") +load("@python3_10//:defs.bzl", "interpreter") load("@rules_python//python:pip.bzl", "pip_parse") pip_parse( diff --git a/android/README.md b/android/README.md index e13344aebe52b..d6a1ba1d4479b 100644 --- a/android/README.md +++ b/android/README.md @@ -9,7 +9,7 @@ Demo applications with code walk-through can be found in [this github repo](https ##### Release Release artifacts are published to jcenter: -``` +```groovy repositories { jcenter() } @@ -32,7 +32,7 @@ dependencies { Nightly(snapshots) builds are published every night from the `master` branch to [nexus sonatype snapshots repository](https://oss.sonatype.org/#nexus-search;quick~pytorch_android) To use them, the repository must be specified explicitly: -``` +```groovy repositories { maven { url "https://oss.sonatype.org/content/repositories/snapshots" @@ -62,7 +62,7 @@ The current nightly(snapshots) version is the value of `VERSION_NAME` in `gradle In some cases you might want to use a local build of pytorch android, for example you may build a custom libtorch binary with another set of operators or make local changes. For this you can use the `./scripts/build_pytorch_android.sh` script.
-``` +```bash git clone https://github.com/pytorch/pytorch.git cd pytorch git submodule update --init --recursive @@ -91,14 +91,14 @@ They are specified as environment variables: After successful build you should see the result as aar file: -``` +```bash $ find pytorch_android/build/ -type f -name *aar pytorch_android/build/outputs/aar/pytorch_android.aar pytorch_android_torchvision/build/outputs/aar/pytorch_android.aar ``` It can be used directly in android projects, as a gradle dependency: -``` +```groovy allprojects { repositories { flatDir { @@ -126,7 +126,7 @@ You can check out [test app example](https://github.com/pytorch/pytorch/blob/mas In some cases, you may want to use libtorch from your android native build. You can do it without building libtorch android, using native libraries from PyTorch android gradle dependency. For that, you will need to add the next lines to your gradle build. -``` +```groovy android { ... configurations { @@ -181,7 +181,7 @@ The added task will unpack them to gradle build directory. In your native build you can link to them adding these lines to your CMakeLists.txt: -``` +```cmake # Relative path of gradle build directory to CMakeLists.txt set(build_DIR ${CMAKE_SOURCE_DIR}/build) @@ -209,7 +209,7 @@ target_link_libraries(${PROJECT_NAME} If your CMakeLists.txt file is located in the same directory as your build.gradle, `set(build_DIR ${CMAKE_SOURCE_DIR}/build)` should work for you. But if you have another location of it, you may need to change it. After that, you can use libtorch C++ API from your native code. -``` +```cpp #include #include #include diff --git a/android/pytorch_android/generate_test_torchscripts.py b/android/pytorch_android/generate_test_torchscripts.py index c3c9518517ae7..a487bd1242e0c 100644 --- a/android/pytorch_android/generate_test_torchscripts.py +++ b/android/pytorch_android/generate_test_torchscripts.py @@ -125,6 +125,15 @@ def conv2d(self, x: Tensor, w: Tensor, toChannelsLast: bool) -> Tensor: r = r.contiguous() return r + @torch.jit.script_method + def conv3d(self, x: Tensor, w: Tensor, toChannelsLast: bool) -> Tensor: + r = torch.nn.functional.conv3d(x, w) + if toChannelsLast: + r = r.contiguous(memory_format=torch.channels_last_3d) + else: + r = r.contiguous() + return r + @torch.jit.script_method def contiguous(self, x: Tensor) -> Tensor: return x.contiguous() diff --git a/android/pytorch_android/src/androidTest/assets/android_api_module.ptl b/android/pytorch_android/src/androidTest/assets/android_api_module.ptl index df62dd8620881..9adfb84bf8551 100644 Binary files a/android/pytorch_android/src/androidTest/assets/android_api_module.ptl and b/android/pytorch_android/src/androidTest/assets/android_api_module.ptl differ diff --git a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java index d2dfa93da17a3..7980a34c04347 100644 --- a/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java +++ b/android/pytorch_android/src/androidTest/java/org/pytorch/PytorchTestBase.java @@ -348,15 +348,32 @@ public void testChannelsLast3d() throws IOException { @Test public void testChannelsLastConv2d() throws IOException { long[] inputShape = new long[] {1, 3, 2, 2}; - long[] dataNCHW = new long[] {1, 2, 3, 4, 11, 12, 13, 14, 101, 102, 103, 104}; + long[] dataNCHW = new long[] { + 111, 112, + 121, 122, + + 211, 212, + 221, 222, + + 311, 312, + 321, 322}; Tensor inputNCHW = Tensor.fromBlob(dataNCHW, inputShape, 
MemoryFormat.CONTIGUOUS); - long[] dataNHWC = new long[] {1, 11, 101, 2, 12, 102, 3, 13, 103, 4, 14, 104}; - Tensor inputNHWC = Tensor.fromBlob(dataNHWC, inputShape, MemoryFormat.CHANNELS_LAST); + long[] dataNHWC = new long[] { + 111, 211, 311, 112, 212, 312, + 121, 221, 321, 122, 222, 322}; + Tensor inputNHWC = Tensor.fromBlob(dataNHWC, inputShape, MemoryFormat.CHANNELS_LAST); long[] weightShape = new long[] {3, 3, 1, 1}; - long[] dataWeightOIHW = new long[] {2, 0, 0, 0, 1, 0, 0, 0, -1}; + long[] dataWeightOIHW = new long[] { + 2, 0, 0, + 0, 1, 0, + 0, 0, -1}; Tensor wNCHW = Tensor.fromBlob(dataWeightOIHW, weightShape, MemoryFormat.CONTIGUOUS); - long[] dataWeightOHWI = new long[] {2, 0, 0, 0, 1, 0, 0, 0, -1}; + long[] dataWeightOHWI = new long[] { + 2, 0, 0, + 0, 1, 0, + 0, 0, -1}; + Tensor wNHWC = Tensor.fromBlob(dataWeightOHWI, weightShape, MemoryFormat.CHANNELS_LAST); final Module module = loadModel(TEST_MODULE_ASSET_NAME); @@ -367,7 +384,15 @@ public void testChannelsLastConv2d() throws IOException { outputNCHW, MemoryFormat.CONTIGUOUS, new long[] {1, 3, 2, 2}, - new long[] {2, 4, 6, 8, 11, 12, 13, 14, -101, -102, -103, -104}); + new long[] { + 2*111, 2*112, + 2*121, 2*122, + + 211, 212, + 221, 222, + + -311, -312, + -321, -322}); final IValue outputNHWC = module.runMethod("conv2d", IValue.from(inputNHWC), IValue.from(wNHWC), IValue.from(true)); @@ -375,7 +400,89 @@ public void testChannelsLastConv2d() throws IOException { outputNHWC, MemoryFormat.CHANNELS_LAST, new long[] {1, 3, 2, 2}, - new long[] {2, 11, -101, 4, 12, -102, 6, 13, -103, 8, 14, -104}); + new long[] { + 2*111, 211, -311, 2*112, 212, -312, + 2*121, 221, -321, 2*122, 222, -322}); + } + + @Test + public void testChannelsLastConv3d() throws IOException { + long[] inputShape = new long[] {1, 3, 2, 2, 2}; + long[] dataNCDHW = new long[] { + 1111, 1112, + 1121, 1122, + 1211, 1212, + 1221, 1222, + + 2111, 2112, + 2121, 2122, + 2211, 2212, + 2221, 2222, + + 3111, 3112, + 3121, 3122, + 3211, 3212, + 3221, 3222}; + Tensor inputNCDHW = Tensor.fromBlob(dataNCDHW, inputShape, MemoryFormat.CONTIGUOUS); + long[] dataNDHWC = new long[] { + 1111, 2111, 3111, + 1112, 2112, 3112, + + 1121, 2121, 3121, + 1122, 2122, 3122, + + 1211, 2211, 3211, + 1212, 2212, 3212, + + 1221, 2221, 3221, + 1222, 2222, 3222}; + + Tensor inputNDHWC = Tensor.fromBlob(dataNDHWC, inputShape, MemoryFormat.CHANNELS_LAST_3D); + + long[] weightShape = new long[] {3, 3, 1, 1, 1}; + long[] dataWeightOIDHW = new long[] { + 2, 0, 0, + 0, 1, 0, + 0, 0, -1, + }; + Tensor wNCDHW = Tensor.fromBlob(dataWeightOIDHW, weightShape, MemoryFormat.CONTIGUOUS); + long[] dataWeightODHWI = new long[] { + 2, 0, 0, + 0, 1, 0, + 0, 0, -1, + }; + Tensor wNDHWC = Tensor.fromBlob(dataWeightODHWI, weightShape, MemoryFormat.CHANNELS_LAST_3D); + + final Module module = loadModel(TEST_MODULE_ASSET_NAME); + + final IValue outputNCDHW = + module.runMethod("conv3d", IValue.from(inputNCDHW), IValue.from(wNCDHW), IValue.from(false)); + assertIValueTensor( + outputNCDHW, + MemoryFormat.CONTIGUOUS, + new long[] {1, 3, 2, 2, 2}, + new long[] { + 2*1111, 2*1112, 2*1121, 2*1122, + 2*1211, 2*1212, 2*1221, 2*1222, + + 2111, 2112, 2121, 2122, + 2211, 2212, 2221, 2222, + + -3111, -3112, -3121, -3122, + -3211, -3212, -3221, -3222}); + + final IValue outputNDHWC = + module.runMethod("conv3d", IValue.from(inputNDHWC), IValue.from(wNDHWC), IValue.from(true)); + assertIValueTensor( + outputNDHWC, + MemoryFormat.CHANNELS_LAST_3D, + new long[] {1, 3, 2, 2, 2}, + new long[] { + 2*1111, 2111, -3111, 2*1112, 2112, 
-3112, + 2*1121, 2121, -3121, 2*1122, 2122, -3122, + + 2*1211, 2211, -3211, 2*1212, 2212, -3212, + 2*1221, 2221, -3221, 2*1222, 2222, -3222}); } @Test diff --git a/android/pytorch_android/test_asset.jit b/android/pytorch_android/test_asset.jit index 3bd9037da4ee6..8605ab13d555e 100644 --- a/android/pytorch_android/test_asset.jit +++ b/android/pytorch_android/test_asset.jit @@ -84,6 +84,15 @@ def conv2d(self, x: Tensor, w: Tensor, toChannelsLast: bool) -> Tensor: r = r.contiguous() return r +def conv3d(self, x: Tensor, w: Tensor, toChannelsLast: bool) -> Tensor: + r = torch.conv3d(x, w) + if (toChannelsLast): + # memory_format=torch.channels_last_3d + r = r.contiguous(memory_format=2) + else: + r = r.contiguous() + return r + def contiguous(self, x: Tensor) -> Tensor: return x.contiguous() diff --git a/aten.bzl b/aten.bzl index 9c6325d16abfb..6d8cff6d1ae7d 100644 --- a/aten.bzl +++ b/aten.bzl @@ -62,11 +62,10 @@ def generate_aten_impl(ctx): outputs = [ops_dir] + ctx.outputs.outs install_dir = paths.dirname(ops_dir.path) - tool_inputs, tool_inputs_manifest = ctx.resolve_tools(tools = [ctx.attr.generator]) - ctx.actions.run_shell( + ctx.actions.run( outputs = outputs, inputs = ctx.files.srcs, - command = ctx.executable.generator.path + " $@", + executable = ctx.executable.generator, arguments = [ "--source-path", "aten/src/ATen", @@ -74,8 +73,6 @@ def generate_aten_impl(ctx): "--install_dir", install_dir, ], - tools = tool_inputs, - input_manifests = tool_inputs_manifest, use_default_shell_env = True, mnemonic = "GenerateAten", ) diff --git a/aten/CMakeLists.txt b/aten/CMakeLists.txt index 412b2a603231d..bda6aea327062 100644 --- a/aten/CMakeLists.txt +++ b/aten/CMakeLists.txt @@ -18,6 +18,8 @@ cmake_policy(SET CMP0012 NEW) ############################################# set(ATen_CPU_SRCS) +set(ATen_XPU_SRCS) +set(ATen_XPU_INCLUDE) set(ATen_CPU_TEST_SRCS) set(ATen_CPU_INCLUDE) set(ATen_THIRD_PARTY_INCLUDE) @@ -34,8 +36,12 @@ set(ATen_HIP_TEST_SRCS) set(ATen_HIP_INCLUDE) set(ATen_MPS_SRCS) set(ATen_MPS_TEST_SRCS) +set(ATen_XPU_SRCS) +set(ATen_XPU_INCLUDE) +set(ATen_XPU_TEST_SRCS) set(ATen_VULKAN_TEST_SRCS) set(ATen_CPU_DEPENDENCY_LIBS) +set(ATen_XPU_DEPENDENCY_LIBS) set(ATen_CUDA_DEPENDENCY_LIBS) set(ATen_HIP_DEPENDENCY_LIBS) set(ATen_PUBLIC_CUDA_DEPENDENCY_LIBS) @@ -102,6 +108,8 @@ add_subdirectory(src/ATen) # Pass source, includes, and libs to parent set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) +set(ATen_XPU_SRCS ${ATen_XPU_SRCS} PARENT_SCOPE) +set(ATen_XPU_INCLUDE ${ATen_XPU_INCLUDE} PARENT_SCOPE) set(ATen_CUDA_CU_SRCS ${ATen_CUDA_CU_SRCS} PARENT_SCOPE) set(ATen_CUDA_CPP_SRCS ${ATen_CUDA_CPP_SRCS} PARENT_SCOPE) set(ATen_CUDA_LINALG_SRCS ${ATen_CUDA_LINALG_SRCS} PARENT_SCOPE) @@ -111,6 +119,8 @@ set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE) set(ATen_MPS_SRCS ${ATen_MPS_SRCS} PARENT_SCOPE) set(ATen_MPS_TEST_SRCS ${ATen_MPS_TEST_SRCS} PARENT_SCOPE) set(ATen_HIP_SRCS_W_SORT_BY_KEY ${ATen_HIP_SRCS_W_SORT_BY_KEY} PARENT_SCOPE) +set(ATen_XPU_SRCS ${ATen_XPU_SRCS} PARENT_SCOPE) +set(ATen_XPU_TEST_SRCS ${ATen_XPU_TEST_SRCS} PARENT_SCOPE) set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE) set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) @@ -122,8 +132,10 @@ set(ATen_VEC_TEST_SRCS ${ATen_VEC_TEST_SRCS} PARENT_SCOPE) set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE) set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE) set(ATen_HIP_INCLUDE ${ATen_HIP_INCLUDE} 
PARENT_SCOPE) +set(ATen_XPU_INCLUDE ${ATen_XPU_INCLUDE} PARENT_SCOPE) set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) +set(ATen_XPU_DEPENDENCY_LIBS ${ATen_XPU_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_HIP_DEPENDENCY_LIBS ${ATen_HIP_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) diff --git a/aten/src/ATen/AccumulateType.h b/aten/src/ATen/AccumulateType.h index f96f34e1e6b6d..0275ef099b03d 100644 --- a/aten/src/ATen/AccumulateType.h +++ b/aten/src/ATen/AccumulateType.h @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include #include // Defines the accumulation type for a scalar type. @@ -87,6 +89,8 @@ MPS_ACC_TYPE(BFloat16, float); MPS_ACC_TYPE(Half, float); MPS_ACC_TYPE(Float8_e5m2, float); MPS_ACC_TYPE(Float8_e4m3fn, float); +MPS_ACC_TYPE(Float8_e5m2fnuz, float); +MPS_ACC_TYPE(Float8_e4m3fnuz, float); MPS_ACC_TYPE(float, float); MPS_ACC_TYPE(double, float); MPS_ACC_TYPE(int8_t, int64_t); @@ -107,6 +111,8 @@ CUDA_ACC_TYPE(BFloat16, float); CUDA_ACC_TYPE(Half, float); CUDA_ACC_TYPE(Float8_e5m2, float); CUDA_ACC_TYPE(Float8_e4m3fn, float); +CUDA_ACC_TYPE(Float8_e5m2fnuz, float); +CUDA_ACC_TYPE(Float8_e4m3fnuz, float); CUDA_ACC_TYPE(float, float); CUDA_ACC_TYPE(double, double); CUDA_ACC_TYPE(int8_t, int64_t); @@ -123,8 +129,8 @@ CUDA_ACC_TYPE(c10::complex, c10::complex); CPU_ACC_TYPE(BFloat16, float); CPU_ACC_TYPE(Half, float); CPU_ACC_TYPE(Float8_e5m2, float); -CPU_ACC_TYPE(Float8_e5m2fnuz, float); CPU_ACC_TYPE(Float8_e4m3fn, float); +CPU_ACC_TYPE(Float8_e5m2fnuz, float); CPU_ACC_TYPE(Float8_e4m3fnuz, float); CPU_ACC_TYPE(float, double); CPU_ACC_TYPE(double, double); diff --git a/aten/src/ATen/BlasBackend.h b/aten/src/ATen/BlasBackend.h new file mode 100644 index 0000000000000..7f8c321ad9fa2 --- /dev/null +++ b/aten/src/ATen/BlasBackend.h @@ -0,0 +1,27 @@ +#pragma once + +#include + +#include +#include + +namespace at { + +enum class BlasBackend : int8_t { Cublas, Cublaslt }; + +inline std::string BlasBackendToString(at::BlasBackend backend) { + switch (backend) { + case BlasBackend::Cublas: + return "at::BlasBackend::Cublas"; + case BlasBackend::Cublaslt: + return "at::BlasBackend::Cublaslt"; + default: + TORCH_CHECK(false, "Unknown blas backend"); + } +} + +inline std::ostream& operator<<(std::ostream& stream, at::BlasBackend backend) { + return stream << BlasBackendToString(backend); +} + +} // namespace at diff --git a/aten/src/ATen/CMakeLists.txt b/aten/src/ATen/CMakeLists.txt index d4ccca9746654..583662e6c63d0 100644 --- a/aten/src/ATen/CMakeLists.txt +++ b/aten/src/ATen/CMakeLists.txt @@ -48,6 +48,7 @@ endif() # NB: If you edit these globs, you'll have to update setup.py package_data as well file(GLOB_RECURSE ATen_CORE_HEADERS "core/*.h") file(GLOB_RECURSE ATen_CORE_SRCS "core/*.cpp") +file(GLOB_RECURSE ATen_TRANSFORMER_HEADERS "native/transformers/*.h") if(NOT BUILD_LITE_INTERPRETER) file(GLOB_RECURSE ATen_CORE_TEST_SRCS "core/*_test.cpp") endif() @@ -60,19 +61,22 @@ endif() file(GLOB base_h "*.h" "detail/*.h" "cpu/*.h" "cpu/vec/vec512/*.h" "cpu/vec/vec256/*.h" "cpu/vec/vec256/vsx/*.h" "cpu/vec/vec256/zarch/*.h" "cpu/vec/*.h" "quantized/*.h" "functorch/*.h") file(GLOB base_cpp "*.cpp" "detail/*.cpp" "cpu/*.cpp" "functorch/*.cpp") -file(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh") -file(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp") 
+file(GLOB cuda_h "cuda/*.h" "cuda/detail/*.h" "cuda/*.cuh" "cuda/detail/*.cuh" "cuda/tunable/*.cuh" "cuda/tunable/*.h") +file(GLOB cuda_cpp "cuda/*.cpp" "cuda/detail/*.cpp" "cuda/tunable/*.cpp") file(GLOB cuda_nvrtc_stub_h "cuda/nvrtc_stub/*.h") file(GLOB cuda_nvrtc_stub_cpp "cuda/nvrtc_stub/*.cpp") -file(GLOB cuda_cu "cuda/*.cu" "cuda/detail/*.cu") +file(GLOB cuda_cu "cuda/*.cu" "cuda/detail/*.cu" "cuda/tunable/*.cu") file(GLOB cudnn_h "cudnn/*.h" "cudnn/*.cuh") file(GLOB cudnn_cpp "cudnn/*.cpp") file(GLOB ops_h "ops/*.h") -file(GLOB hip_h "hip/*.h" "hip/detail/*.h" "hip/*.cuh" "hip/detail/*.cuh" "hip/impl/*.h") -file(GLOB hip_cpp "hip/*.cpp" "hip/detail/*.cpp" "hip/impl/*.cpp") +file(GLOB xpu_h "xpu/*.h" "xpu/detail/*.h") +file(GLOB xpu_cpp "xpu/*.cpp" "xpu/detail/*.cpp") + +file(GLOB hip_h "hip/*.h" "hip/detail/*.h" "hip/*.cuh" "hip/detail/*.cuh" "hip/impl/*.h" "hip/tunable/*.cuh" "hip/tunable/*.h") +file(GLOB hip_cpp "hip/*.cpp" "hip/detail/*.cpp" "hip/impl/*.cpp" "hip/tunable/*.cpp") list(REMOVE_ITEM hip_cpp "${CMAKE_CURRENT_SOURCE_DIR}/hip/detail/LazyNVRTC.cpp") -file(GLOB hip_hip "hip/*.hip" "hip/detail/*.hip" "hip/impl/*.hip") +file(GLOB hip_hip "hip/*.hip" "hip/detail/*.hip" "hip/impl/*.hip" "hip/tunable/*.hip") file(GLOB hip_nvrtc_stub_h "hip/nvrtc_stub/*.h") file(GLOB hip_nvrtc_stub_cpp "hip/nvrtc_stub/*.cpp") file(GLOB miopen_h "miopen/*.h") @@ -81,6 +85,8 @@ file(GLOB miopen_cpp "miopen/*.cpp") file(GLOB mkl_cpp "mkl/*.cpp") file(GLOB mkldnn_cpp "mkldnn/*.cpp") +file(GLOB mkldnn_xpu_cpp "native/mkldnn/xpu/*.cpp" "native/mkldnn/xpu/detail/*.cpp") + file(GLOB native_cpp "native/*.cpp") file(GLOB native_mkl_cpp "native/mkl/*.cpp") file(GLOB native_mkldnn_cpp "native/mkldnn/*.cpp") @@ -138,6 +144,7 @@ file(GLOB native_sparse_cuda_cpp "native/sparse/cuda/*.cpp") file(GLOB native_quantized_cuda_cu "native/quantized/cuda/*.cu") file(GLOB native_quantized_cuda_cpp "native/quantized/cuda/*.cpp") file(GLOB native_quantized_cudnn_cpp "native/quantized/cudnn/*.cpp") +file(GLOB native_nested_h "native/nested/*.h") file(GLOB native_nested_cuda_cu "native/nested/cuda/*.cu") file(GLOB native_nested_cuda_cpp "native/nested/cuda/*.cpp") @@ -233,6 +240,20 @@ else() set(all_cpu_cpp ${all_cpu_cpp} ${vulkan_cpp}) endif() +if(USE_XPU) + list(APPEND ATen_XPU_SRCS ${mkldnn_xpu_cpp}) + list(APPEND ATen_XPU_DEPENDENCY_LIBS xpu_mkldnn) + + list(APPEND ATen_XPU_DEPENDENCY_LIBS ${OCL_LIBRARY}) + list(APPEND ATen_XPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/mkldnn/xpu) + list(APPEND ATen_XPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/native/mkldnn/xpu/detail) + list(APPEND ATen_XPU_INCLUDE ${PROJECT_SOURCE_DIR}/third_party/ideep/mkl-dnn/include) + list(APPEND ATen_XPU_INCLUDE ${XPU_MKLDNN_INCLUDE}) + + list(APPEND ATen_XPU_INCLUDE ${SYCL_INCLUDE_DIR}) + list(APPEND ATen_XPU_DEPENDENCY_LIBS ${SYCL_LIBRARY}) +endif() + # Metal if(USE_PYTORCH_METAL_EXPORT) # Add files needed from exporting metal models(optimized_for_mobile) @@ -321,6 +342,11 @@ if(USE_ROCM) ) endif() +if(USE_XPU) + list(APPEND ATen_XPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/xpu) + list(APPEND ATen_XPU_SRCS ${xpu_cpp}) +endif() + list(APPEND ATen_CPU_INCLUDE ${CMAKE_CURRENT_SOURCE_DIR}/..) 
if(USE_TBB) @@ -410,50 +436,31 @@ if(NOT CMAKE_SYSTEM_PROCESSOR MATCHES "^(s390x|ppc64le)$") list(APPEND ATen_CPU_DEPENDENCY_LIBS cpuinfo) endif() -if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) - # Preserve values for the main build - set(__aten_sleef_build_shared_libs ${BUILD_SHARED_LIBS}) - set(__aten_sleef_build_tests ${BUILD_TESTS}) - - # Unset our restrictive C++ flags here and reset them later. - # Remove this once we use proper target_compile_options. - set(OLD_CMAKE_CXX_FLAGS ${CMAKE_CXX_FLAGS}) - set(CMAKE_CXX_FLAGS) - - # Bump up optimization level for sleef to -O1, since at -O0 the compiler - # excessively spills intermediate vector registers to the stack - # and makes things run impossibly slowly - set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) - if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") - string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) - else() - set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") +if(NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) + if(NOT MSVC) + # Bump up optimization level for sleef to -O1, since at -O0 the compiler + # excessively spills intermediate vector registers to the stack + # and makes things run impossibly slowly + set(OLD_CMAKE_C_FLAGS_DEBUG ${CMAKE_C_FLAGS_DEBUG}) + if(${CMAKE_C_FLAGS_DEBUG} MATCHES "-O0") + string(REGEX REPLACE "-O0" "-O1" CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) + else() + set(CMAKE_C_FLAGS_DEBUG "${CMAKE_C_FLAGS_DEBUG} -O1") + endif() endif() if(NOT USE_SYSTEM_SLEEF) - set(BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) - set(BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) - set(BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) - set(BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) - set(OLD_CMAKE_BUILD_TYPE ${CMAKE_BUILD_TYPE}) + set(SLEEF_BUILD_SHARED_LIBS OFF CACHE BOOL "Build sleef static" FORCE) + set(SLEEF_BUILD_DFT OFF CACHE BOOL "Don't build sleef DFT lib" FORCE) + set(SLEEF_BUILD_GNUABI_LIBS OFF CACHE BOOL "Don't build sleef gnuabi libs" FORCE) + set(SLEEF_BUILD_TESTS OFF CACHE BOOL "Don't build sleef tests" FORCE) + set(SLEEF_BUILD_SCALAR_LIB OFF CACHE BOOL "libsleefscalar will be built." FORCE) if(CMAKE_SYSTEM_NAME STREQUAL "Darwin") if(CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR CMAKE_OSX_ARCHITECTURES MATCHES "arm64") set(DISABLE_SVE ON CACHE BOOL "Xcode's clang-12.5 crashes while trying to compile SVE code" FORCE) endif() endif() - if("${CMAKE_C_COMPILER_ID}" STREQUAL "GNU" AND - CMAKE_C_COMPILER_VERSION VERSION_GREATER 6.9 AND CMAKE_C_COMPILER_VERSION VERSION_LESS 8) - set(GCC_7 True) - else() - set(GCC_7 False) - endif() - if(GCC_7) - set(CMAKE_BUILD_TYPE Release) # Always build Sleef as a Release build to work around a gcc-7 bug - endif() add_subdirectory("${CMAKE_CURRENT_SOURCE_DIR}/../../../third_party/sleef" ${CMAKE_BINARY_DIR}/sleef) - if(GCC_7) - set(CMAKE_BUILD_TYPE ${OLD_CMAKE_BUILD_TYPE}) - endif() set_property(TARGET sleef PROPERTY FOLDER "dependencies") list(APPEND ATen_THIRD_PARTY_INCLUDE ${CMAKE_BINARY_DIR}/include) link_directories(${CMAKE_BINARY_DIR}/sleef/lib) @@ -468,12 +475,9 @@ if(NOT MSVC AND NOT EMSCRIPTEN AND NOT INTERN_BUILD_MOBILE) endif() list(APPEND ATen_CPU_DEPENDENCY_LIBS sleef) - set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) - set(CMAKE_CXX_FLAGS ${OLD_CMAKE_CXX_FLAGS}) - - # Set these back. 
TODO: Use SLEEF_ to pass these instead - set(BUILD_SHARED_LIBS ${__aten_sleef_build_shared_libs} CACHE BOOL "Build shared libs" FORCE) - set(BUILD_TESTS ${__aten_sleef_build_tests} CACHE BOOL "Build tests" FORCE) + if(NOT MSVC) + set(CMAKE_C_FLAGS_DEBUG ${OLD_CMAKE_C_FLAGS_DEBUG}) + endif() endif() if(USE_CUDA AND NOT USE_ROCM) @@ -577,9 +581,9 @@ configure_file(ATenConfig.cmake.in "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/AT install(FILES "${CMAKE_CURRENT_BINARY_DIR}/cmake-exports/ATenConfig.cmake" DESTINATION "${AT_INSTALL_SHARE_DIR}/cmake/ATen") -set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS}) +set(INSTALL_HEADERS ${base_h} ${ATen_CORE_HEADERS} ${native_nested_h} ${ATen_TRANSFORMER_HEADERS}) if(NOT INTERN_BUILD_MOBILE) - list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${mps_h} ${native_mps_h} ${native_utils_h} ${miopen_h}) + list(APPEND INSTALL_HEADERS ${native_h} ${native_cpu_h} ${native_ao_sparse_h} ${native_quantized_h} ${cuda_h} ${native_cuda_h} ${native_hip_h} ${cudnn_h} ${hip_h} ${xpu_h} ${mps_h} ${native_mps_h} ${native_utils_h} ${miopen_h}) # Metal if(USE_PYTORCH_METAL_EXPORT) # Add files needed from exporting metal models(optimized_for_mobile) @@ -641,6 +645,7 @@ list(APPEND ATen_MOBILE_BENCHMARK_SRCS # Pass source, includes, and libs to parent set(ATen_CORE_SRCS ${ATen_CORE_SRCS} PARENT_SCOPE) set(ATen_CPU_SRCS ${ATen_CPU_SRCS} PARENT_SCOPE) +set(ATen_XPU_SRCS ${ATen_XPU_SRCS} PARENT_SCOPE) set(ATen_CUDA_CU_SRCS ${ATen_CUDA_CU_SRCS} PARENT_SCOPE) set(ATen_CUDA_CPP_SRCS ${ATen_CUDA_CPP_SRCS} PARENT_SCOPE) set(ATen_CUDA_LINALG_SRCS ${ATen_CUDA_LINALG_SRCS} PARENT_SCOPE) @@ -649,9 +654,11 @@ set(ATen_CUDA_CU_SRCS_W_SORT_BY_KEY ${ATen_CUDA_CU_SRCS_W_SORT_BY_KEY} PARENT_SC set(ATen_NVRTC_STUB_SRCS ${ATen_NVRTC_STUB_SRCS} PARENT_SCOPE) set(ATen_HIP_SRCS ${ATen_HIP_SRCS} PARENT_SCOPE) set(ATen_MPS_SRCS ${ATen_MPS_SRCS} PARENT_SCOPE) +set(ATen_XPU_SRCS ${ATen_XPU_SRCS} PARENT_SCOPE) set(ATen_QUANTIZED_SRCS ${ATen_QUANTIZED_SRCS} PARENT_SCOPE) set(ATen_CPU_TEST_SRCS ${ATen_CPU_TEST_SRCS} PARENT_SCOPE) set(ATen_CUDA_TEST_SRCS ${ATen_CUDA_TEST_SRCS} PARENT_SCOPE) +set(ATen_XPU_TEST_SRCS ${ATen_XPU_TEST_SRCS} PARENT_SCOPE) set(ATen_CORE_TEST_SRCS ${ATen_CORE_TEST_SRCS} PARENT_SCOPE) set(ATen_HIP_TEST_SRCS ${ATen_HIP_TEST_SRCS} PARENT_SCOPE) set(ATen_VULKAN_TEST_SRCS ${ATen_VULKAN_TEST_SRCS} PARENT_SCOPE) @@ -664,9 +671,11 @@ set(ATen_CPU_INCLUDE ${ATen_CPU_INCLUDE} PARENT_SCOPE) set(ATen_THIRD_PARTY_INCLUDE ${ATen_THIRD_PARTY_INCLUDE} PARENT_SCOPE) set(ATen_CUDA_INCLUDE ${ATen_CUDA_INCLUDE} PARENT_SCOPE) set(ATen_HIP_INCLUDE ${ATen_HIP_INCLUDE} PARENT_SCOPE) +set(ATen_XPU_INCLUDE ${ATen_XPU_INCLUDE} PARENT_SCOPE) set(ATen_VULKAN_INCLUDE ${ATen_VULKAN_INCLUDE} PARENT_SCOPE) set(ATen_CPU_DEPENDENCY_LIBS ${ATen_CPU_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_CUDA_DEPENDENCY_LIBS ${ATen_CUDA_DEPENDENCY_LIBS} PARENT_SCOPE) +set(ATen_XPU_DEPENDENCY_LIBS ${ATen_XPU_DEPENDENCY_LIBS} PARENT_SCOPE) set(ATen_HIP_DEPENDENCY_LIBS ${ATen_HIP_DEPENDENCY_LIBS} PARENT_SCOPE) set(FLASH_ATTENTION_CUDA_SOURCES ${FLASH_ATTENTION_CUDA_SOURCES} PARENT_SCOPE) set(MEM_EFF_ATTENTION_CUDA_SOURCES ${MEM_EFF_ATTENTION_CUDA_SOURCES} PARENT_SCOPE) diff --git a/aten/src/ATen/CPUApplyUtils.h b/aten/src/ATen/CPUApplyUtils.h index 225390ab0beea..5c524ef97c475 100644 --- a/aten/src/ATen/CPUApplyUtils.h +++ b/aten/src/ATen/CPUApplyUtils.h @@ -6,7 +6,6 @@ #include #include #include -#include namespace at { diff 
--git a/aten/src/ATen/CPUGeneratorImpl.cpp b/aten/src/ATen/CPUGeneratorImpl.cpp index bf4306fce5772..2d086ebbe71fe 100644 --- a/aten/src/ATen/CPUGeneratorImpl.cpp +++ b/aten/src/ATen/CPUGeneratorImpl.cpp @@ -141,8 +141,8 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { using detail::CPUGeneratorImplState; using detail::CPUGeneratorImplStateLegacy; - static_assert(std::is_standard_layout::value, "CPUGeneratorImplStateLegacy is not a PODType"); - static_assert(std::is_standard_layout::value, "CPUGeneratorImplState is not a PODType"); + static_assert(std::is_standard_layout_v, "CPUGeneratorImplStateLegacy is not a PODType"); + static_assert(std::is_standard_layout_v, "CPUGeneratorImplState is not a PODType"); static const size_t size_legacy = sizeof(CPUGeneratorImplStateLegacy); static const size_t size_current = sizeof(CPUGeneratorImplState); @@ -155,8 +155,7 @@ void CPUGeneratorImpl::set_state(const c10::TensorImpl& new_state) { auto double_normal_sample = c10::optional(); // Construct the state of at::CPUGeneratorImpl based on input byte tensor size. - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - CPUGeneratorImplStateLegacy* legacy_pod; + CPUGeneratorImplStateLegacy* legacy_pod{nullptr}; auto new_state_size = new_state.numel(); if (new_state_size == size_legacy) { legacy_pod = (CPUGeneratorImplStateLegacy*)new_state.data(); @@ -221,7 +220,7 @@ c10::intrusive_ptr CPUGeneratorImpl::get_state() const { using detail::CPUGeneratorImplState; static const size_t size = sizeof(CPUGeneratorImplState); - static_assert(std::is_standard_layout::value, "CPUGeneratorImplState is not a PODType"); + static_assert(std::is_standard_layout_v, "CPUGeneratorImplState is not a PODType"); auto state_tensor = at::detail::empty_cpu({(int64_t)size}, ScalarType::Byte, c10::nullopt, c10::nullopt, c10::nullopt, c10::nullopt); auto rng_state = state_tensor.data_ptr(); diff --git a/aten/src/ATen/Context.cpp b/aten/src/ATen/Context.cpp index da13bf05c4390..7fd191ef3f38c 100644 --- a/aten/src/ATen/Context.cpp +++ b/aten/src/ATen/Context.cpp @@ -7,12 +7,16 @@ #include #include #include +#include #include #ifdef USE_FBGEMM #include #endif // USE_FBGEMM +#if defined(__aarch64__) && !defined(C10_MOBILE) +#include +#endif namespace at { @@ -133,6 +137,15 @@ void Context::setSDPUseMath(bool e) { enabled_mathSDP = e; } +bool Context::userEnabledCuDNNSDP() const { + return enabled_cudnnSDP; +} + +void Context::setSDPUseCuDNN(bool e) { + enabled_cudnnSDP = e; +} + + // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) static const char cublas_config_var_name[] = "CUBLAS_WORKSPACE_CONFIG"; // NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays,modernize-avoid-c-arrays) @@ -165,7 +178,7 @@ void Context::alertCuBLASConfigNotDeterministic() const { "case, you must set an environment variable before running your PyTorch application: ", cublas_config_var_name, "=", cublas_deterministic_configs[0], " or ", cublas_config_var_name, "=", cublas_deterministic_configs[1], ". 
For more information, go to ", - "https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility" + "https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility" ); if (deterministicAlgorithmsWarnOnly()) { @@ -250,6 +263,30 @@ void Context::setLinalgPreferredBackend(at::LinalgBackend b) { } } +at::BlasBackend Context::blasPreferredBackend() const { + return blas_preferred_backend; +} + +void Context::setBlasPreferredBackend(at::BlasBackend b) { +#ifdef _MSC_VER + TORCH_WARN_ONCE( + "torch.backends.cuda.preferred_blas_library is an experimental feature. " + "It is not supported on Windows." + ); +#else + TORCH_CHECK((b != at::BlasBackend::Cublaslt) || hasCuBLASLt(), + "Cannot set preferred backend to cuBLASLt if PyTorch has not been compiled with cuBLASLt."); + if (b != at::BlasBackend::Cublas) { + TORCH_WARN_ONCE( + "torch.backends.cuda.preferred_blas_library is an experimental feature. " + "If you see any error or unexpected behavior when this flag is set " + "please file an issue on GitHub." + ); + } + blas_preferred_backend = b; +#endif +} + bool Context::allowFP16ReductionCuBLAS() const { return allow_fp16_reduction_cublas; } @@ -424,25 +461,23 @@ bool NoTF32Guard::should_disable_tf32() { return override_allow_tf32_flag; } -#ifdef USE_ROCM // Ops can query this flag to know they are in the backward pass. // This information can be used, for example, to select implementations // with different numerical or performance characteristics. // See https://pytorch.org/docs/stable/notes/numerical_accuracy.html for details. -thread_local bool ROCmBackwardPassGuard::is_backward_pass_; +thread_local bool rocm_is_backward_pass; ROCmBackwardPassGuard::ROCmBackwardPassGuard() { - is_backward_pass_ = true; + rocm_is_backward_pass = true; } ROCmBackwardPassGuard::~ROCmBackwardPassGuard() { - is_backward_pass_ = false; + rocm_is_backward_pass = false; } bool ROCmBackwardPassGuard::is_backward_pass() { - return is_backward_pass_; + return rocm_is_backward_pass; } -#endif bool Context::areVmapFallbackWarningsEnabled() const { return display_vmap_fallback_warnings_; @@ -469,4 +504,21 @@ void Context::unsetDefaultMobileCPUAllocator() { c10::SetCPUAllocator(prev_allocator_ptr_ , /*priority*/ 100); prev_allocator_ptr_ = nullptr; } + +bool Context::allowFP16ReductionCPU() const { + return allow_fp16_reduction_cpu; +} + +void Context::setAllowFP16ReductionCPU(bool b) { + if ( b && !allow_fp16_reduction_cpu) { + // Check that CPU supports fp16 reductions +#if defined(__aarch64__) && !defined(C10_MOBILE) + if (!cpuinfo_initialize() || !cpuinfo_has_arm_fp16_arith()) +#else + if (true) +#endif + throw std::runtime_error("Float16 arithmetic is not supported by the CPU!"); + } + allow_fp16_reduction_cpu = b; +} } // namespace at diff --git a/aten/src/ATen/Context.h b/aten/src/ATen/Context.h index 5baad73669af3..b50f0479e2fab 100644 --- a/aten/src/ATen/Context.h +++ b/aten/src/ATen/Context.h @@ -1,17 +1,20 @@ #pragma once +#include #include +#include #include #include #include #include #include +#include #include #include #include +#include #include #include -#include #include #include #include @@ -22,7 +25,6 @@ #include #include -#include #include namespace at { @@ -56,13 +58,34 @@ class TORCH_API Context { AT_ERROR(c10::DeviceTypeName(device_type), " device type not enabled."); } } + const AcceleratorHooksInterface& getAcceleratorHooksInterface( + c10::optional opt_device_type = c10::nullopt) { + c10::DeviceType device_type = opt_device_type.has_value() + ? 
opt_device_type.value() + : at::getAccelerator(true).value(); + if (device_type == at::kCUDA) { + return at::detail::getCUDAHooks(); + } else if (device_type == at::kMPS) { + return at::detail::getMPSHooks(); + } else if (device_type == at::kPrivateUse1) { + return at::detail::getPrivateUse1Hooks(); + } else if (device_type == at::kMTIA) { + return at::detail::getMTIAHooks(); + } else { + AT_ERROR( + c10::DeviceTypeName(device_type), " device type not an accelerator."); + } + } Device getDeviceFromPtr(void* data, c10::DeviceType device_type) { initCUDAIfNeeded(device_type); initHIPIfNeeded(device_type); + initXPUIfNeeded(device_type); if (device_type == at::kCPU) { return c10::DeviceType::CPU; } else if (device_type == at::kCUDA) { return at::detail::getCUDAHooks().getDeviceFromPtr(data); + } else if (device_type == at::kXPU) { + return at::detail::getXPUHooks().getDeviceFromPtr(data); } else if (device_type == at::kPrivateUse1) { return at::GetPrivateUse1HooksInterface()->getDeviceFromPtr(data); } else { @@ -100,6 +123,9 @@ class TORCH_API Context { static bool hasCuSOLVER() { return detail::getCUDAHooks().hasCuSOLVER(); } + static bool hasCuBLASLt() { + return detail::getCUDAHooks().hasCuBLASLt(); + } static bool hasHIP() { return detail::getHIPHooks().hasHIP(); } @@ -118,8 +144,8 @@ class TORCH_API Context { static bool hasLazy() { return c10::impl::hasDeviceGuardImpl(c10::DeviceType::Lazy); } - static bool hasORT() { - return c10::impl::hasDeviceGuardImpl(c10::DeviceType::ORT); + static bool hasMAIA() { + return c10::impl::hasDeviceGuardImpl(c10::DeviceType::MAIA); } // defined in header so that getNonVariableType has ability to inline // call_once check. getNonVariableType is called fairly frequently @@ -129,6 +155,12 @@ class TORCH_API Context { void lazyInitHIP() { c10::call_once(thh_init, [&] { detail::getHIPHooks().initHIP(); }); } + void lazyInitXPU() { + c10::call_once(thx_init, [&] { detail::getXPUHooks().initXPU(); }); + } + void lazyInitMTIA() { + c10::call_once(th_mtia_init, [&] { detail::getMTIAHooks().initMTIA(); }); + } void lazyInitPrivateUse1() { c10::call_once(thp_init, [&] { if (isPrivateUse1HooksRegistered()) { @@ -179,9 +211,15 @@ class TORCH_API Context { void setSDPUseMath(bool); bool userEnabledMathSDP() const; + void setSDPUseCuDNN(bool); + bool userEnabledCuDNNSDP() const; + at::LinalgBackend linalgPreferredBackend() const; void setLinalgPreferredBackend(at::LinalgBackend); + at::BlasBackend blasPreferredBackend() const; + void setBlasPreferredBackend(at::BlasBackend); + // Note [Enabling Deterministic Operations] // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Operations in PyTorch that normally act nondeterministically, but have an @@ -262,7 +300,7 @@ class TORCH_API Context { // Throws an error if `Context::deterministicAlgorithms()` is true, CUDA // >= 10.2, and CUBLAS_WORKSPACE_CONFIG is not set to either ":16:8" or // ":4096:8". 
For more details: - // https://docs.nvidia.com/cuda/cublas/index.html#cublasApi_reproducibility + // https://docs.nvidia.com/cuda/cublas/index.html#results-reproducibility void alertCuBLASConfigNotDeterministic() const; void setFloat32MatmulPrecision(const std::string& s); @@ -293,6 +331,8 @@ class TORCH_API Context { void setDefaultMobileCPUAllocator(); void unsetDefaultMobileCPUAllocator(); + bool allowFP16ReductionCPU() const; + void setAllowFP16ReductionCPU(bool); private: void initCUDAIfNeeded(c10::DeviceType p) { @@ -305,9 +345,16 @@ class TORCH_API Context { lazyInitHIP(); } } + void initXPUIfNeeded(c10::DeviceType p) { + if (p == c10::DeviceType::XPU) { + lazyInitXPU(); + } + } static bool checkCuBLASConfigDeterministic(); c10::once_flag thc_init; c10::once_flag thh_init; + c10::once_flag thx_init; + c10::once_flag th_mtia_init; c10::once_flag thp_init; bool enabled_cudnn = true; bool deterministic_cudnn = false; @@ -317,6 +364,7 @@ class TORCH_API Context { bool enabled_flashSDP = true; bool enabled_mem_efficientSDP = true; bool enabled_mathSDP = true; + bool enabled_cudnnSDP = false; #ifdef USE_ROCM bool benchmark_cudnn = true; #else @@ -336,6 +384,11 @@ class TORCH_API Context { c10::utils::check_env("TORCH_LINALG_PREFER_CUSOLVER") == true ? at::LinalgBackend::Cusolver : at::LinalgBackend::Default; + at::BlasBackend blas_preferred_backend = + (c10::utils::check_env("TORCH_BLAS_PREFER_CUBLASLT") == true || + c10::utils::check_env("TORCH_BLAS_PREFER_HIPBLASLT") == true) + ? at::BlasBackend::Cublaslt + : at::BlasBackend::Cublas; #ifdef C10_MOBILE bool release_original_weights = true; #else @@ -344,6 +397,7 @@ class TORCH_API Context { bool display_vmap_fallback_warnings_ = false; c10::optional quantized_engine = c10::nullopt; bool enable_sparse_tensor_invariant_checks = false; + bool allow_fp16_reduction_cpu = false; Allocator* prev_allocator_ptr_{nullptr}; }; @@ -407,8 +461,8 @@ static inline bool hasMPS() { return globalContext().hasMPS(); } -static inline bool hasORT() { - return globalContext().hasORT(); +static inline bool hasMAIA() { + return globalContext().hasMAIA(); } static inline bool hasXPU() { @@ -478,7 +532,7 @@ static inline void manual_seed(uint64_t seed) { } const auto xpu_num_gpus = detail::getXPUHooks().getNumGPUs(); - if (hasXPU() && xpu_num_gpus > 0) { + if (hasXPU() && xpu_num_gpus) { for (const auto i : c10::irange(xpu_num_gpus)) { auto xpu_gen = globalContext().defaultGenerator( Device(at::kXPU, static_cast(i))); @@ -515,15 +569,10 @@ struct TORCH_API NoTF32Guard { bool changed = false; }; -#ifdef USE_ROCM struct TORCH_API ROCmBackwardPassGuard { ROCmBackwardPassGuard(); ~ROCmBackwardPassGuard(); static bool is_backward_pass(); - - private: - static thread_local bool is_backward_pass_; }; -#endif } // namespace at diff --git a/aten/src/ATen/DLConvertor.cpp b/aten/src/ATen/DLConvertor.cpp index 8f2cac8206d4d..3d2350d261013 100644 --- a/aten/src/ATen/DLConvertor.cpp +++ b/aten/src/ATen/DLConvertor.cpp @@ -9,6 +9,13 @@ DLDataType getDLDataType(const Tensor& t) { dtype.lanes = 1; dtype.bits = t.element_size() * 8; switch (t.scalar_type()) { + case ScalarType::UInt1: + case ScalarType::UInt2: + case ScalarType::UInt3: + case ScalarType::UInt4: + case ScalarType::UInt5: + case ScalarType::UInt6: + case ScalarType::UInt7: case ScalarType::Byte: case ScalarType::UInt16: case ScalarType::UInt32: @@ -81,9 +88,9 @@ DLDataType getDLDataType(const Tensor& t) { return dtype; } -static DLDevice getDLDevice(const Tensor& tensor, const int64_t& device_id) { +static DLDevice 
getDLDevice(const Tensor& tensor, c10::DeviceIndex device_id) { DLDevice ctx; - ctx.device_id = device_id; + ctx.device_id = static_cast(device_id); switch (tensor.device().type()) { case DeviceType::CPU: ctx.device_type = DLDeviceType::kDLCPU; @@ -104,8 +111,9 @@ static DLDevice getDLDevice(const Tensor& tensor, const int64_t& device_id) { ctx.device_type = DLDeviceType::kDLROCM; break; case DeviceType::XPU: - ctx = at::detail::getXPUHooks().getDLPackDeviceFromATenDevice( - ctx, tensor.device(), tensor.data_ptr()); + ctx.device_type = DLDeviceType::kDLOneAPI; + ctx.device_id = + at::detail::getXPUHooks().getGlobalIdxFromDevice(tensor.device()); break; default: TORCH_CHECK(false, "Cannot pack tensors on " + tensor.device().str()); @@ -132,7 +140,7 @@ static Device getATenDevice(const DLDevice& ctx, void* data) { return at::Device(DeviceType::HIP, ctx.device_id); #endif case DLDeviceType::kDLOneAPI: - return at::detail::getXPUHooks().getATenDeviceFromDLPackDevice(ctx, data); + return at::detail::getXPUHooks().getDeviceFromPtr(data); default: TORCH_CHECK( false, "Unsupported device_type: " + c10::to_string(ctx.device_type)); @@ -140,7 +148,7 @@ static Device getATenDevice(const DLDevice& ctx, void* data) { } ScalarType toScalarType(const DLDataType& dtype) { - ScalarType stype; + ScalarType stype = ScalarType::Undefined; TORCH_CHECK(dtype.lanes == 1, "ATen does not support lanes != 1"); switch (dtype.code) { case DLDataTypeCode::kDLUInt: @@ -148,6 +156,15 @@ ScalarType toScalarType(const DLDataType& dtype) { case 8: stype = ScalarType::Byte; break; + case 16: + stype = ScalarType::UInt16; + break; + case 32: + stype = ScalarType::UInt32; + break; + case 64: + stype = ScalarType::UInt64; + break; default: TORCH_CHECK( false, "Unsupported kUInt bits " + c10::to_string(dtype.bits)); @@ -225,8 +242,7 @@ ScalarType toScalarType(const DLDataType& dtype) { } break; default: - TORCH_CHECK( - false, "Unsupported code " + c10::to_string(dtype.code)); + TORCH_CHECK(false, "Unsupported code " + c10::to_string(dtype.code)); } return stype; } @@ -248,7 +264,7 @@ DLManagedTensor* toDLPack(const Tensor& src) { // gh-83069 auto shape = src.sizes(); auto strides = src.strides().vec(); - for (int i=0; itensor.manager_ctx = atDLMTensor; atDLMTensor->tensor.deleter = &deleter; atDLMTensor->tensor.dl_tensor.data = view.data_ptr(); - int64_t device_id = 0; + c10::DeviceIndex device_id = 0; if (src.is_cuda()) { device_id = src.get_device(); } atDLMTensor->tensor.dl_tensor.device = getDLDevice(src, device_id); atDLMTensor->tensor.dl_tensor.ndim = src.dim(); atDLMTensor->tensor.dl_tensor.dtype = getDLDataType(src); - atDLMTensor->tensor.dl_tensor.shape = - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) - const_cast(view.sizes().data()); - atDLMTensor->tensor.dl_tensor.strides = - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) - const_cast(view.strides().data()); + atDLMTensor->tensor.dl_tensor.shape = view.sizes().data(); + atDLMTensor->tensor.dl_tensor.strides = view.strides().data(); atDLMTensor->tensor.dl_tensor.byte_offset = 0; return &(atDLMTensor->tensor); } -Tensor fromDLPack(const DLManagedTensor* src) { +Tensor fromDLPack(DLManagedTensor* src) { auto deleter = [src](void* self) { if (src->deleter) { - // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) - src->deleter(const_cast(src)); + src->deleter(src); } }; return fromDLPack(src, std::move(deleter)); } Tensor fromDLPack( - const DLManagedTensor* src, + DLManagedTensor* src, std::function deleter) { Device device = 
getATenDevice(src->dl_tensor.device, src->dl_tensor.data); ScalarType stype = toScalarType(src->dl_tensor.dtype); @@ -296,7 +307,7 @@ Tensor fromDLPack( return at::from_blob( src->dl_tensor.data, IntArrayRef(src->dl_tensor.shape, src->dl_tensor.ndim), - deleter, + std::move(deleter), at::device(device).dtype(stype), {device}); } @@ -306,6 +317,6 @@ Tensor fromDLPack( IntArrayRef(src->dl_tensor.strides, src->dl_tensor.ndim), deleter, at::device(device).dtype(stype), - { device }); + {device}); } } // namespace at diff --git a/aten/src/ATen/DLConvertor.h b/aten/src/ATen/DLConvertor.h index 9b8fce1015fe4..b35c9657527d8 100644 --- a/aten/src/ATen/DLConvertor.h +++ b/aten/src/ATen/DLConvertor.h @@ -12,9 +12,13 @@ namespace at { TORCH_API ScalarType toScalarType(const DLDataType& dtype); TORCH_API DLManagedTensor* toDLPack(const Tensor& src); -TORCH_API Tensor fromDLPack(const DLManagedTensor* src); +TORCH_API Tensor fromDLPack(DLManagedTensor* src); +C10_DEPRECATED_MESSAGE("Please migrate to a non-const variant") +inline Tensor fromDLPack(const DLManagedTensor* src) { + return fromDLPack(const_cast(src)); +} TORCH_API Tensor -fromDLPack(const DLManagedTensor* src, std::function deleter); +fromDLPack(DLManagedTensor* src, std::function deleter); TORCH_API DLDataType getDLDataType(const Tensor& t); TORCH_API DLDevice getDLContext(const Tensor& tensor, const int64_t& device_id); diff --git a/aten/src/ATen/DeviceAccelerator.cpp b/aten/src/ATen/DeviceAccelerator.cpp new file mode 100644 index 0000000000000..ec3cd2a2f5527 --- /dev/null +++ b/aten/src/ATen/DeviceAccelerator.cpp @@ -0,0 +1,39 @@ +#include +#include + +namespace at { + +C10_API std::optional getAccelerator(bool checked) { +#define CHECK_NO_CUDA \ + TORCH_CHECK(!at::hasCUDA(), "Cannot have both CUDA and PrivateUse1"); + +#define CHECK_NO_PU1 \ + TORCH_CHECK(!is_privateuse1_backend_registered(), "Cannot have both CUDA and PrivateUse1"); + +#define CHECK_NO_MTIA \ + TORCH_CHECK(!at::hasMTIA(), "Cannot have MTIA with other devices"); + + if (is_privateuse1_backend_registered()) { + // We explicitly allow PrivateUse1 and another device at the same time + // as we use this for testing. + // Whenever a PrivateUse1 device is registered, use it first. + return kPrivateUse1; + } else if (at::hasCUDA()) { + CHECK_NO_PU1 + CHECK_NO_MTIA + return kCUDA; + } else if (at::hasMTIA()) { + CHECK_NO_CUDA + CHECK_NO_PU1 + return kMTIA; + } else { + TORCH_CHECK(!checked, "Cannot access accelerator device when none is available.") + return std::nullopt; + } + +#undef CHECK_NO_CUDA +#undef CHECK_NO_PU1 +} + + +} // namespace at diff --git a/aten/src/ATen/DeviceAccelerator.h b/aten/src/ATen/DeviceAccelerator.h new file mode 100644 index 0000000000000..c3e800c7e07c6 --- /dev/null +++ b/aten/src/ATen/DeviceAccelerator.h @@ -0,0 +1,27 @@ +#pragma once + +#include +#include + +#include +#include + +// This file defines the top level Accelerator concept for PyTorch. +// A device is an accelerator per the definition here if: +// - It is mutually exclusive with all other accelerators +// - It performs asynchronous compute via a Stream/Event system +// - It provides a set of common APIs as defined by AcceleratorHooksInterface +// +// As of today, accelerator devices are (in no particular order): +// CUDA, MTIA, PrivateUse1 +// We want to add once all the proper APIs are supported and tested: +// HIP, MPS, XPU + +namespace at { + +// Ensures that only one accelerator is available (at +// compile time if possible) and return it. 
+// When checked is true, the returned optional always has a value. +TORCH_API std::optional getAccelerator(bool checked = false); + +} // namespace at diff --git a/aten/src/ATen/Dispatch_v2.h b/aten/src/ATen/Dispatch_v2.h index f5f41ac47647c..e0764834c02fd 100644 --- a/aten/src/ATen/Dispatch_v2.h +++ b/aten/src/ATen/Dispatch_v2.h @@ -112,12 +112,12 @@ // Ensure we never have too many scalar types for the expansion here to // support. To bump this, you must regenerate the macros below. -static_assert(static_cast(c10::ScalarType::NumOptions) < 32); +static_assert(static_cast(c10::ScalarType::NumOptions) < 45); // Python code to regenerate generate code below: #if 0 -num_args = 32 +num_args = 45 nums = ', '.join(str(i) for i in reversed(range(num_args+1))) args = ', '.join(f'_{i}' for i in range(1, num_args+1)) @@ -135,8 +135,8 @@ for i in range(1, num_args+1): // Begin generated code // clang-format off -#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)) -#define AT_NUM_ARGS_AUX(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, N, ...) N +#define AT_NUM_ARGS(...) AT_EXPAND(AT_NUM_ARGS_AUX(__VA_ARGS__, 45, 44, 43, 42, 41, 40, 39, 38, 37, 36, 35, 34, 33, 32, 31, 30, 29, 28, 27, 26, 25, 24, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)) +#define AT_NUM_ARGS_AUX(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45, N, ...) N #define AT_AP1(N, _1) AT_DISPATCH_CASE(_1, N) #define AT_AP2(N, _1, _2) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) #define AT_AP3(N, _1, _2, _3) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) @@ -169,6 +169,18 @@ for i in range(1, num_args+1): #define AT_AP30(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) #define AT_AP31(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) 
AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) #define AT_AP32(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) - +#define AT_AP33(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) +#define AT_AP34(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) +#define AT_AP35(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35) AT_DISPATCH_CASE(_1, N) 
AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) +#define AT_AP36(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) +#define AT_AP37(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) +#define AT_AP38(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) 
AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) +#define AT_AP39(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) +#define AT_AP40(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) +#define AT_AP41(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, 
N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) +#define AT_AP42(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) +#define AT_AP43(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) +#define AT_AP44(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) 
AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) +#define AT_AP45(N, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _11, _12, _13, _14, _15, _16, _17, _18, _19, _20, _21, _22, _23, _24, _25, _26, _27, _28, _29, _30, _31, _32, _33, _34, _35, _36, _37, _38, _39, _40, _41, _42, _43, _44, _45) AT_DISPATCH_CASE(_1, N) AT_DISPATCH_CASE(_2, N) AT_DISPATCH_CASE(_3, N) AT_DISPATCH_CASE(_4, N) AT_DISPATCH_CASE(_5, N) AT_DISPATCH_CASE(_6, N) AT_DISPATCH_CASE(_7, N) AT_DISPATCH_CASE(_8, N) AT_DISPATCH_CASE(_9, N) AT_DISPATCH_CASE(_10, N) AT_DISPATCH_CASE(_11, N) AT_DISPATCH_CASE(_12, N) AT_DISPATCH_CASE(_13, N) AT_DISPATCH_CASE(_14, N) AT_DISPATCH_CASE(_15, N) AT_DISPATCH_CASE(_16, N) AT_DISPATCH_CASE(_17, N) AT_DISPATCH_CASE(_18, N) AT_DISPATCH_CASE(_19, N) AT_DISPATCH_CASE(_20, N) AT_DISPATCH_CASE(_21, N) AT_DISPATCH_CASE(_22, N) AT_DISPATCH_CASE(_23, N) AT_DISPATCH_CASE(_24, N) AT_DISPATCH_CASE(_25, N) AT_DISPATCH_CASE(_26, N) AT_DISPATCH_CASE(_27, N) AT_DISPATCH_CASE(_28, N) AT_DISPATCH_CASE(_29, N) AT_DISPATCH_CASE(_30, N) AT_DISPATCH_CASE(_31, N) AT_DISPATCH_CASE(_32, N) AT_DISPATCH_CASE(_33, N) AT_DISPATCH_CASE(_34, N) AT_DISPATCH_CASE(_35, N) AT_DISPATCH_CASE(_36, N) AT_DISPATCH_CASE(_37, N) AT_DISPATCH_CASE(_38, N) AT_DISPATCH_CASE(_39, N) AT_DISPATCH_CASE(_40, N) AT_DISPATCH_CASE(_41, N) AT_DISPATCH_CASE(_42, N) AT_DISPATCH_CASE(_43, N) AT_DISPATCH_CASE(_44, N) AT_DISPATCH_CASE(_45, N) // End generated code // clang-format on diff --git a/aten/src/ATen/DynamicLibrary.cpp b/aten/src/ATen/DynamicLibrary.cpp index f3287121b2e26..7dc27f38fa7f0 100644 --- a/aten/src/ATen/DynamicLibrary.cpp +++ b/aten/src/ATen/DynamicLibrary.cpp @@ -25,9 +25,7 @@ static void* checkDL(void* x) { return x; } -DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name, bool leak_handle_): leak_handle(leak_handle_) { - // NOLINTNEXTLINE(hicpp-signed-bitwise) - handle = dlopen(name, RTLD_LOCAL | RTLD_NOW); +DynamicLibrary::DynamicLibrary(const char* name, const char* alt_name, bool leak_handle_): leak_handle(leak_handle_), handle(dlopen(name, RTLD_LOCAL | RTLD_NOW)) { if (!handle) { if (alt_name) { handle = dlopen(alt_name, RTLD_LOCAL | RTLD_NOW); diff --git a/aten/src/ATen/EmptyTensor.cpp b/aten/src/ATen/EmptyTensor.cpp index 459960bbf86ad..0b35fc67b53ac 100644 --- a/aten/src/ATen/EmptyTensor.cpp +++ b/aten/src/ATen/EmptyTensor.cpp @@ -1,6 +1,9 @@ #define TORCH_ASSERT_NO_OPERATORS #include #include +#include +#include +#include #include #include @@ -10,7 +13,18 @@ namespace at::detail { namespace { c10::Allocator* GetCPUAllocatorMaybePinned(bool pin_memory) { if 
(pin_memory) { - return at::detail::getCUDAHooks().getPinnedMemoryAllocator(); + // NB: This is not quite right, if you somehow had both CUDA and PrivateUse1 initialized + // in the same PyTorch build, you would ONLY ever get the CUDA pinned memory allocator. + // To properly support this, see https://github.com/pytorch/pytorch/issues/14560 + if (at::globalContext().hasCUDA()) { + return at::detail::getCUDAHooks().getPinnedMemoryAllocator(); + } else if (at::globalContext().hasXPU()) { + return at::detail::getXPUHooks().getPinnedMemoryAllocator(); + } else if(at::isPrivateUse1HooksRegistered()) { + return at::GetPrivateUse1HooksInterface()->getPinnedMemoryAllocator(); + } else { + TORCH_CHECK(false, "Need to provide pin_memory allocator to use pin memory.") + } } return c10::GetCPUAllocator(); } @@ -80,7 +94,7 @@ size_t computeStorageNbytes( return 0; } - uint64_t strided_size; + uint64_t strided_size = 0; overflowed |= c10::mul_overflows(strides[i], sizes[i] - 1, &strided_size); overflowed |= c10::add_overflows(size, strided_size, &size); } @@ -134,7 +148,7 @@ SymInt computeStorageNbytes( // of the last element according to stride SymInt size = 1; for (const auto i : c10::irange(sizes.size())) { - if (sizes[i] == 0) { + if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[i].sym_eq(0))) { return 0; } @@ -187,6 +201,15 @@ TensorBase empty_generic( return _empty_generic(size, allocator, ks, scalar_type, memory_format_opt); } +TensorBase empty_generic_symint( + SymIntArrayRef size, + c10::Allocator* allocator, + c10::DispatchKeySet ks, + ScalarType scalar_type, + c10::optional memory_format_opt) { + return _empty_generic(size, allocator, ks, scalar_type, memory_format_opt); +} + template TensorBase _empty_strided_generic( T size, @@ -305,12 +328,13 @@ struct MetaAllocator final : public at::Allocator { static void deleter(void* const pointer) { TORCH_INTERNAL_ASSERT(!pointer); } - DataPtr allocate(const size_t nbytes) const override { + DataPtr allocate(const size_t nbytes) override { return {nullptr, nullptr, &deleter, at::Device(DeviceType::Meta)}; } DeleterFnPtr raw_deleter() const override { return deleter; } + void copy_data(void* dest, const void* src, std::size_t count) const final {} }; static MetaAllocator g_meta_alloc; diff --git a/aten/src/ATen/EmptyTensor.h b/aten/src/ATen/EmptyTensor.h index 5f8681ce37f96..f6e2e53bc99f5 100644 --- a/aten/src/ATen/EmptyTensor.h +++ b/aten/src/ATen/EmptyTensor.h @@ -51,6 +51,13 @@ TORCH_API TensorBase empty_generic( ScalarType scalar_type, c10::optional memory_format_opt); +TORCH_API TensorBase empty_generic_symint( + SymIntArrayRef size, + c10::Allocator* allocator, + c10::DispatchKeySet ks, + ScalarType scalar_type, + c10::optional memory_format_opt); + TORCH_API TensorBase empty_strided_generic( IntArrayRef size, IntArrayRef stride, diff --git a/aten/src/ATen/ExpandUtils.cpp b/aten/src/ATen/ExpandUtils.cpp index d066f99242ab0..cfa2f63a5b8a8 100644 --- a/aten/src/ATen/ExpandUtils.cpp +++ b/aten/src/ATen/ExpandUtils.cpp @@ -15,13 +15,13 @@ namespace { // NOTE: are_expandable did a similar check, please keep them sync if change is needed template Container infer_size_impl(ArrayType a, ArrayType b) { - size_t dimsA = a.size(); - size_t dimsB = b.size(); - size_t ndim = dimsA > dimsB ? dimsA : dimsB; + // Use ptrdiff_t to ensure signed comparison. + auto dimsA = static_cast(a.size()); + auto dimsB = static_cast(b.size()); + auto ndim = dimsA > dimsB ? dimsA : dimsB; Container expandedSizes(ndim); - // Use ptrdiff_t to ensure signed comparison. 
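// --- Illustrative sketch (not part of the upstream patch) ---------------------
// The infer_size_impl() hunk above only changes integer signedness (size_t vs.
// ptrdiff_t); the broadcasting rule itself is untouched. A minimal standalone
// version of that rule against plain std::vector<int64_t> shapes; the function
// and variable names here are illustrative, not ATen's:
#include <algorithm>
#include <cstdint>
#include <stdexcept>
#include <vector>

inline std::vector<int64_t> infer_broadcast_size(
    const std::vector<int64_t>& a,
    const std::vector<int64_t>& b) {
  const size_t ndim = std::max(a.size(), b.size());
  std::vector<int64_t> out(ndim, 1);
  // Walk dimensions right-to-left; a missing leading dim behaves like size 1.
  for (size_t off = 0; off < ndim; ++off) {
    const int64_t da = off < a.size() ? a[a.size() - 1 - off] : 1;
    const int64_t db = off < b.size() ? b[b.size() - 1 - off] : 1;
    if (da != db && da != 1 && db != 1) {
      throw std::invalid_argument("shapes are not broadcastable");
    }
    out[ndim - 1 - off] = (da == 1) ? db : da;
  }
  return out;
}
// e.g. infer_broadcast_size({8, 1, 6}, {7, 1}) yields {8, 7, 6}.
// -------------------------------------------------------------------------------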
- for (ptrdiff_t i = (ptrdiff_t)ndim - 1; i >= 0; --i) { + for (ptrdiff_t i = ndim - 1; i >= 0; --i) { ptrdiff_t offset = ndim - 1 - i; ptrdiff_t dimA = dimsA - 1 - offset; ptrdiff_t dimB = dimsB - 1 - offset; @@ -63,8 +63,8 @@ C10_ALWAYS_INLINE InferExpandGeometryResult inferExpandGeometryImpl( IntArrayRef tensor_sizes, IntArrayRef tensor_strides, IntArrayRef sizes) { - int64_t ndim = sizes.size(); - int64_t tensor_dim = tensor_sizes.size(); + int64_t ndim = static_cast(sizes.size()); + int64_t tensor_dim = static_cast(tensor_sizes.size()); if (tensor_dim == 0) { return InferExpandGeometryResult(sizes, ndim); diff --git a/aten/src/ATen/ExpandUtils.h b/aten/src/ATen/ExpandUtils.h index 82db1f8b6517c..03cfca36e7227 100644 --- a/aten/src/ATen/ExpandUtils.h +++ b/aten/src/ATen/ExpandUtils.h @@ -462,7 +462,8 @@ inline Tensor _sum_to( reduce_dims.push_back(i); } for (int64_t i = leading_dims; i < static_cast(sizes.size()); ++i) { - if (shape[i - leading_dims] == 1 && sizes[i] != 1) { + if (shape[i - leading_dims] == 1 && + TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(sizes[i], 1))) { reduce_dims.push_back(i); } } diff --git a/aten/src/ATen/FunctionalInverses.cpp b/aten/src/ATen/FunctionalInverses.cpp index 953636df5abc9..ebc24085a74a8 100644 --- a/aten/src/ATen/FunctionalInverses.cpp +++ b/aten/src/ATen/FunctionalInverses.cpp @@ -174,8 +174,8 @@ Tensor FunctionalInverses::expand_inverse(const Tensor& base, const Tensor& muta return mutated_view.as_strided_symint( base.sym_sizes(), base.sym_strides(), base.sym_storage_offset()); } else { - return at::sum_to( - mutated_view, + return base + at::sum_to( + mutated_view - base, base.sym_sizes(), /*always_return_non_view=*/inverse_return_mode == InverseReturnMode::NeverView ); @@ -224,48 +224,48 @@ Tensor FunctionalInverses::slice_Tensor_inverse(const Tensor& base, const Tensor if (inverse_return_mode == InverseReturnMode::AlwaysView) { // NB: assumes mutated_view is a narrowed view of base. // We should NOT do this for functionalization - return mutated_view.as_strided_symint( - base.sym_sizes(), base.sym_strides(), base.sym_storage_offset()); + return mutated_view.slice_inverse_symint( + base, dim, std::move(start), std::move(end), std::move(step)); } else { return base.slice_scatter_symint(mutated_view, dim, std::move(start), std::move(end), std::move(step)); } } Tensor FunctionalInverses::split_Tensor_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, int64_t mutated_view_idx, c10::SymInt split_size, int64_t dim) { + // It would be nice if this logic could be re-used from autograd's split_backward(), but I don't think it can. + // For functionalization, we have only have one of the tensors from the TensorList outputed by split(), and we want to layer i + // on top of the base tensor. + // For autograd, we have all of the tensors outputted by split() and we just want to stack them. + dim = at::maybe_wrap_dim(dim, base.dim()); + auto dim_size = base.sym_size(dim); + auto start = split_size * mutated_view_idx; + auto end = split_size + start; + if (end > dim_size) end = dim_size; + if (inverse_return_mode == InverseReturnMode::AlwaysView) { // NB: assumes mutated_view is a narrowed view of base. 
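// --- Illustrative sketch (not part of the upstream patch) ---------------------
// The split_Tensor_inverse() change above hoists the chunk-boundary arithmetic
// out of the else branch so that both the view path and the scatter path share
// it. Chunk i of split(t, split_size, dim) covers [start, end) along dim, with
// the last chunk clamped to the dimension size. The same arithmetic with plain
// integers (names are illustrative; the real code uses c10::SymInt):
#include <algorithm>
#include <cstdint>
#include <utility>

inline std::pair<int64_t, int64_t> split_chunk_bounds(
    int64_t dim_size, int64_t split_size, int64_t chunk_idx) {
  const int64_t start = split_size * chunk_idx;
  const int64_t end = std::min(start + split_size, dim_size);
  return {start, end};
}
// e.g. split_chunk_bounds(/*dim_size=*/10, /*split_size=*/4, /*chunk_idx=*/2)
// yields {8, 10}: the last chunk is shorter than split_size.
// -------------------------------------------------------------------------------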
// We should NOT do this for functionalization - return mutated_view.as_strided_symint( - base.sym_sizes(), base.sym_strides(), base.sym_storage_offset()); + return mutated_view.slice_inverse_symint(base, dim, start, end, 1); } else { - // It would be nice if this logic could be re-used from autograd's split_backward(), but I don't think it can. - // For functionalization, we have only have one of the tensors from the TensorList outputed by split(), and we want to layer i - // on top of the base tensor. - // For autograd, we have all of the tensors outputted by split() and we just want to stack them. - dim = at::maybe_wrap_dim(dim, base.dim()); - auto dim_size = base.sym_size(dim); - auto start = split_size * mutated_view_idx; - auto end = split_size + start; - if (end > dim_size) end = dim_size; return base.slice_scatter_symint(mutated_view, dim, start, end, 1); } } Tensor FunctionalInverses::split_with_sizes_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, int64_t mutated_view_idx, c10::SymIntArrayRef split_sizes, int64_t dim) { + dim = at::maybe_wrap_dim(dim, base.dim()); + auto dim_size = base.sym_size(dim); + c10::SymInt start = 0; + for (auto i = 0; i < mutated_view_idx; ++i) { + start += split_sizes[i]; + } + auto end = start + split_sizes[mutated_view_idx]; + if (end > dim_size) end = dim_size; + if (inverse_return_mode == InverseReturnMode::AlwaysView) { // NB: assumes mutated_view is a narrowed view of base. // We should NOT do this for functionalization - return mutated_view.as_strided_symint( - base.sym_sizes(), base.sym_strides(), base.sym_storage_offset()); + return mutated_view.slice_inverse_symint(base, dim, start, end, 1); } else { - dim = at::maybe_wrap_dim(dim, base.dim()); - auto dim_size = base.sym_size(dim); - c10::SymInt start = 0; - for (auto i = 0; i < mutated_view_idx; ++i) { - start += split_sizes[i]; - } - auto end = start + split_sizes[mutated_view_idx]; - if (end > dim_size) end = dim_size; return base.slice_scatter_symint(mutated_view, dim, start, end, 1); } } @@ -303,6 +303,29 @@ Tensor FunctionalInverses::_nested_view_from_buffer_inverse(const Tensor& base, return Tensor(); } +Tensor FunctionalInverses::_nested_view_from_jagged_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, const Tensor& offsets, const Tensor& dummy, const std::optional& lengths, int64_t ragged_idx) { + auto values = at::_nested_get_values(mutated_view); + if (inverse_return_mode != InverseReturnMode::NeverView) { + return values; + } else { + return values.clone(/*memory_format=*/at::MemoryFormat::Contiguous); + } +} + +Tensor FunctionalInverses::_nested_get_values_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode) { + auto offsets = at::_nested_get_offsets(base); + auto lengths = at::_nested_get_lengths(base); + auto ragged_idx = at::_nested_get_ragged_idx(base); + auto dummy = at::_nested_get_jagged_dummy(base); + auto nt = at::_nested_view_from_jagged(mutated_view, offsets, dummy, lengths, ragged_idx); + + if (inverse_return_mode != InverseReturnMode::NeverView) { + return nt; + } else { + return nt.clone(/*memory_format=*/at::MemoryFormat::Contiguous); + } +} + Tensor FunctionalInverses::unsqueeze_inverse(const Tensor& base, const Tensor& mutated_view, InverseReturnMode inverse_return_mode, int64_t dim) { if (inverse_return_mode != InverseReturnMode::NeverView) { return at::squeeze(mutated_view, dim); @@ -428,12 +451,22 @@ Tensor 
FunctionalInverses::narrow_inverse(const at::Tensor & base, const at::Ten if (inverse_return_mode == InverseReturnMode::AlwaysView) { // NB: assumes mutated_view is a narrowed view of base. // We should NOT do this for functionalization - return mutated_view.as_strided_symint( - base.sym_sizes(), base.sym_strides(), base.sym_storage_offset()); + return mutated_view.slice_inverse_symint(base, dim, std::move(start), start + length, 1); } else { return base.slice_scatter_symint( mutated_view, dim, std::move(start), start + length, 1); } } +Tensor FunctionalInverses::slice_inverse_inverse(const at::Tensor & base, const at::Tensor & mutated_view, InverseReturnMode inverse_return_mode, const at::Tensor & src, int64_t dim, std::optional start, std::optional end, c10::SymInt step) { + // slice_inverse() inverse is just slice() + if (inverse_return_mode == InverseReturnMode::NeverView) { + return at::slice_copy_symint( + mutated_view, dim, std::move(start), std::move(end), std::move(step)); + } else { + return mutated_view.slice_symint( + dim, std::move(start), std::move(end), std::move(step)); + } +} + } // namespace at::functionalization diff --git a/aten/src/ATen/FunctionalStorageImpl.cpp b/aten/src/ATen/FunctionalStorageImpl.cpp index a2f486b7db681..78a5b6a9cfbe9 100644 --- a/aten/src/ATen/FunctionalStorageImpl.cpp +++ b/aten/src/ATen/FunctionalStorageImpl.cpp @@ -10,7 +10,7 @@ namespace at::functionalization { ViewMeta ViewMeta::to_out_idx(int64_t out_idx) { if (out_idx == this->out_index) return *this; - return ViewMeta(forward_fn, reverse_fn, is_multi_output, out_idx); + return ViewMeta(forward_fn, reverse_fn, is_multi_output, is_as_strided, out_idx); } // Note [Functionalization: Alias Removal Part 2] @@ -94,7 +94,7 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base) get_nbytes(base), DataPtr{nullptr, base.device()}, GetAllocator(kMeta), - /*resizeable=*/true + /*resizable=*/true ), base_(base) { @@ -103,6 +103,18 @@ FunctionalStorageImpl::FunctionalStorageImpl(const Tensor& base) void FunctionalStorageImpl::add_update(const Tensor& updated_val, const std::vector& metas) { TORCH_CHECK(!frozen_, "cannot mutate tensors with frozen storage"); + + if (metas.size() > 1) { + for (size_t i = 1; i < metas.size(); ++i) { + // Skipping this check for XLA. Would be good to add it back, but it is failing XLA CI + TORCH_CHECK(updated_val.device().type() == c10::DeviceType::XLA || !metas[i].is_as_strided, +"During torch.compile, encountered a mutation on a view chain of length ", metas.size(), ", where view ", i, +" was an as_strided() call. as_strided() is non-compositional, and therefore is not possible to functionalize properly today," +"so this behavior is banned in compile. As a workaround, you can either remove the mutation from the model code, or you " +"can insert a graph break right before the mutation with torch._dynamo.graph_break(). 
If you would like this behavior to " +"work properly, please comment on https://github.com/pytorch/pytorch/issues/104505."); + } + } updates_.push_back({updated_val, metas}); generation_++; } diff --git a/aten/src/ATen/FunctionalStorageImpl.h b/aten/src/ATen/FunctionalStorageImpl.h index 52c01c3a53c0c..8d899fe01624a 100644 --- a/aten/src/ATen/FunctionalStorageImpl.h +++ b/aten/src/ATen/FunctionalStorageImpl.h @@ -32,11 +32,13 @@ struct ViewMeta { std::function forward, std::function reverse, bool is_multi_output = false, + bool is_as_strided = false, int64_t out_idx = 0) : forward_fn(std::move(forward)), reverse_fn(std::move(reverse)), out_index(out_idx), - is_multi_output(is_multi_output) {} + is_multi_output(is_multi_output), + is_as_strided(is_as_strided) {} std::function forward_fn; std::function reverse_fn; @@ -46,6 +48,8 @@ struct ViewMeta { // Tells us if this is a multi-output view bool is_multi_output; + bool is_as_strided; + // Returns a copy of the current ViewMeta, if out_idx matches the current // out_index. Otherwise, returns a new ViewMeta with the same forward/reverse // functions, but a new out index. @@ -79,7 +83,9 @@ struct ViewMeta { struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { public: struct Update { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) const at::Tensor new_val; + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) const std::vector view_metas; }; @@ -101,6 +107,31 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { ~FunctionalStorageImpl() override = default; + void mark_mutation() { + mutation_counter_++; + } + void mark_mutation_during_no_grad_or_inference_mode() { + mutation_counter_during_no_grad_or_inference_mode_++; + } + void mark_mutation_hidden_from_autograd() { + mutation_counter_hidden_from_autograd_++; + } + + bool are_all_mutations_under_no_grad_or_inference_mode() const { + auto non_autograd_mutations = + mutation_counter_during_no_grad_or_inference_mode_ + + mutation_counter_hidden_from_autograd_; + // The <= is because both counters will technically be incremented, if we + // perform e.g. a triton kernel mutation under no_grad + return mutation_counter_ <= non_autograd_mutations; + } + + bool are_all_mutations_hidden_from_autograd() const { + // mutations under no_grad / inference_mode are technically not hidden from + // autograd - they change the version counter + return mutation_counter_ <= mutation_counter_hidden_from_autograd_; + } + private: // NB: base_ should always point to a tensor BELOW the current // functionalization layer. This is mainly to avoid reference cycles. e.g. @@ -119,6 +150,28 @@ struct TORCH_API FunctionalStorageImpl : public c10::StorageImpl { // If frozen, no more mutations are allowed on this storage. Once frozen, a // storage cannot be unfrozen. bool frozen_ = false; + + // These mutation counters are bumped on the storage + // whenever a FunctionalTensorWrapper experiences a mutation. + // When the mutation is under no_grad, or comes from a triton kernel, we also + // bump the corresponding during_no_grad or hidden_from_autograd counters. Why + // do we need to detect these two situations separately from "normal" input + // mutations? 
(1) "normal" input mutations can mutate autograd metadata like + // .grad_fn, + // in which case they need to be replayed outside of the compiled graph + // (2) "no_grad" input mutations are generally safe to keep in the graph (and + // compile), + // but they bump the tensor's VC, so we need to mark_dirty() on the inputs + // in torch.compile + // (3) mutations that are fully hidden from autograd (e.g. from a triton + // kernel) + // do not mutate any autograd state, and be fully kept in the graph + // When we detect that an input was mutated, we need to be able to tell if: + // (1) all of the mutations were from triton kernels + // (2) all of the mutations were under no_grad + uint64_t mutation_counter_during_no_grad_or_inference_mode_ = 0; + uint64_t mutation_counter_ = 0; + uint64_t mutation_counter_hidden_from_autograd_ = 0; }; } // namespace at::functionalization diff --git a/aten/src/ATen/FunctionalTensorWrapper.cpp b/aten/src/ATen/FunctionalTensorWrapper.cpp index bd260f241e00e..a7ba697d13932 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.cpp +++ b/aten/src/ATen/FunctionalTensorWrapper.cpp @@ -129,7 +129,7 @@ void FunctionalTensorWrapper::freeze_storage() const { // - view_value: The output tensor that we need to wrap. // - base: The "base" of the view that `view_value` was generated from. // See Note [Functionalization: Alias Removal Part 2] for more details on the mutation replay logic. -FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, functionalization::ViewMeta meta) +FunctionalTensorWrapper::FunctionalTensorWrapper(const Tensor& view_value, const FunctionalTensorWrapper* base, const functionalization::ViewMeta& meta) : c10::TensorImpl( c10::DispatchKeySet(DispatchKey::Functionalize), view_value.dtype(), @@ -174,7 +174,7 @@ bool FunctionalTensorWrapper::is_up_to_date() const { } // See Note [Functionalization Pass - Inplace View Ops] -void FunctionalTensorWrapper::mutate_view_meta(at::functionalization::ViewMeta meta) { +void FunctionalTensorWrapper::mutate_view_meta(const at::functionalization::ViewMeta& meta) { view_metas_.push_back(meta); // Manually track the fact that this tensor recieved a metadata mutation! has_metadata_mutation_ = true; @@ -212,7 +212,7 @@ void FunctionalTensorWrapper::mutate_view_meta(at::functionalization::ViewMeta m // In the above, tmp is a batched tensor (because adding a normal tensor to a batched tensor does broadcasting and creates a batched tensor). // But we can't just replace the underlying memory backing `tensor` with `tmp` - a batched tensor takes up more space! // Instead, every input, intermediate and output of the program is wrapped in a FunctionalTensorImpl, which wraps the underlying tensor. -void FunctionalTensorWrapper::replace_(const Tensor& other) { +void FunctionalTensorWrapper::replace_(const Tensor& other, bool from_lazy_regenerate) { // TODO: going to need to change this if we want nested functionalize() transforms. 
TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(other)); value_ = other; @@ -231,10 +231,19 @@ void FunctionalTensorWrapper::replace_(const Tensor& other) { value_ = at::_to_copy(value_, c10::TensorOptions().dtype(dtype()).layout(layout())); TORCH_INTERNAL_ASSERT(!value_.key_set().has(c10::DispatchKey::Functionalize)); } - mutation_counter_++; - if (!at::GradMode::is_enabled() || InferenceMode::is_enabled()) { - // This mutation happened under no_grad or inference_mode - mark_mutation_during_no_grad_or_inference_mode(); + // might not be until after the no_grad region is exited. + // Therefore, replace_() is not unconditionally safe to check the current no_grad state. + // If this is a lazy regeneration, then it is guaranteed that we have already + // done the mutation for the storage alias (when we originally performed the mutation), + // so no counter update may be needed. + // Example: if a mutation happens to a view under a no_grad, + // we won't call replace_() on the other alias until the alias is later used, which + if (!from_lazy_regenerate) { + mark_mutation(); + if (!at::GradMode::is_enabled() || InferenceMode::is_enabled()) { + // This mutation happened under no_grad or inference_mode + mark_mutation_during_no_grad_or_inference_mode(); + } } } @@ -328,17 +337,27 @@ void FunctionalTensorWrapper::sync_() { regenerate_from_base(); } +Tensor FunctionalTensorWrapper::apply_view_metas(const Tensor& base) { + auto t = base; + + // Reapply views to get the viewed tensor from the base in alias_ + for (auto& view_meta: view_metas_) { + t = view_meta.forward_fn(t, view_meta.out_index); + } + + return t; +} + void FunctionalTensorWrapper::regenerate_from_base() { at::AutoDispatchSkipFunctionalize guard; auto storage_impl = functional_storage_impl(); auto t = storage_impl->base(); + TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); - // Reapply views to get the viewed tensor from the base in alias_ - for (auto& view_meta: view_metas_) { - t = view_meta.forward_fn(t, view_meta.out_index); - } + t = apply_view_metas(t); TORCH_INTERNAL_ASSERT(!at::functionalization::impl::isFunctionalTensor(t)); - replace_(t); + + replace_(t, /*from_lazy_regenerate=*/true); generation_ = storage_impl->generation(); } @@ -352,6 +371,38 @@ const char* FunctionalTensorWrapper::tensorimpl_type_name() const { return "FunctionalTensorWrapper"; } +void FunctionalTensorWrapper::copy_tensor_metadata( + const FunctionalTensorWrapper* src_impl, + FunctionalTensorWrapper* dest_impl, + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) { + TensorImpl::copy_tensor_metadata( + src_impl, + dest_impl, + version_counter, + allow_tensor_metadata_change); + + // FunctionalTensorWrapper-specific fields. 
+ dest_impl->value_ = src_impl->value_; + dest_impl->level_ = src_impl->level_; + dest_impl->has_metadata_mutation_ = src_impl->has_metadata_mutation_; + dest_impl->is_multi_output_view_ = src_impl->is_multi_output_view_; + dest_impl->was_storage_changed_ = src_impl->was_storage_changed_; + dest_impl->generation_ = src_impl->generation_; + dest_impl->view_metas_ = src_impl->view_metas_; +} + + +void FunctionalTensorWrapper::copy_tensor_metadata_and_refresh( + const FunctionalTensorWrapper* src_impl, + FunctionalTensorWrapper* dest_impl, + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const { + copy_tensor_metadata(src_impl, dest_impl, version_counter, allow_tensor_metadata_change); + dest_impl->refresh_numel(); + dest_impl->refresh_contiguous(); +} + template c10::intrusive_ptr FunctionalTensorWrapper::shallow_copy_and_detach_core( VariableVersion&& version_counter, @@ -367,16 +418,11 @@ c10::intrusive_ptr FunctionalTensorWrapper::shallow_copy_and_detach_ } auto impl = c10::make_intrusive(value_); - copy_tensor_metadata( + copy_tensor_metadata_and_refresh( /*src_impl=*/this, /*dest_impl=*/impl.get(), /*version_counter=*/std::forward(version_counter), /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); - impl->level_ = level_; - impl->generation_ = generation_; - impl->view_metas_ = view_metas_; - impl->refresh_numel(); - impl->refresh_contiguous(); return impl; } @@ -394,6 +440,18 @@ c10::intrusive_ptr FunctionalTensorWrapper::shallow_copy_and_detach( std::move(version_counter), allow_tensor_metadata_change); } +void FunctionalTensorWrapper::shallow_copy_from(const c10::intrusive_ptr& impl) { + AT_ASSERT(has_compatible_shallow_copy_type(impl->key_set())); + auto functional_impl = + static_cast(impl.get()); + copy_tensor_metadata_and_refresh( + /*src_impl=*/functional_impl, + /*dest_impl=*/this, + /*version_counter=*/version_counter(), + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change()); +} + + c10::Device FunctionalTensorWrapper::device_custom() const { return value_.unsafeGetTensorImpl()->device(); } @@ -442,8 +500,8 @@ c10::optional to_functional_tensor(const c10::optional& tensor) } return c10::nullopt; } -c10::List> to_functional_tensor(const c10::List>& t_list) { - c10::List> outputs; +c10::List<::std::optional> to_functional_tensor(const c10::List<::std::optional>& t_list) { + c10::List<::std::optional> outputs; outputs.reserve(t_list.size()); for (const auto i : c10::irange(t_list.size())) { outputs.push_back(to_functional_tensor(t_list[i])); @@ -494,8 +552,8 @@ std::vector from_functional_tensor(ITensorListRef t_list) { } return outputs; } -c10::List> from_functional_tensor(const c10::List>& t_list) { - c10::List> outputs; +c10::List<::std::optional> from_functional_tensor(const c10::List<::std::optional>& t_list) { + c10::List<::std::optional> outputs; outputs.reserve(t_list.size()); for (const auto i : c10::irange(t_list.size())) { outputs.push_back(from_functional_tensor(t_list[i], /*assert_functional=*/false)); @@ -530,7 +588,7 @@ void sync(ITensorListRef t_list) { sync(t); } } -void sync(const c10::List>& t_list) { +void sync(const c10::List<::std::optional>& t_list) { for (const auto i : c10::irange(t_list.size())) { sync(t_list[i]); } @@ -610,7 +668,7 @@ bool isFunctionalTensor(const c10::optional& t) { } } -bool isFunctionalTensor(const c10::List>& t_list) { +bool isFunctionalTensor(const c10::List<::std::optional>& t_list) { if (t_list.empty()) return false; auto functional_count = 0; for (const auto i : 
c10::irange(t_list.size())) { @@ -658,7 +716,7 @@ Tensor create_functional_tensor_with_view_meta(const at::Tensor& view_to_wrap, c return at::detail::make_tensor(view_to_wrap, functional_base_impl, meta); } -std::vector create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, functionalization::ViewMeta meta) { +std::vector create_functional_tensor_with_view_meta(ITensorListRef view_to_wrap, const at::Tensor& base, const functionalization::ViewMeta& meta) { std::vector outputs(view_to_wrap.size()); int64_t i = 0; for (const auto& tensor : view_to_wrap) { @@ -668,10 +726,10 @@ std::vector create_functional_tensor_with_view_meta(ITensorListRef view_ return outputs; } -void mutate_view_meta(const at::Tensor& self, functionalization::ViewMeta meta) { +void mutate_view_meta(const at::Tensor& self, const functionalization::ViewMeta& meta) { TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self)); auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self); - self_impl->mutate_view_meta(std::move(meta)); + self_impl->mutate_view_meta(meta); } // Note [Propagating strides in the functionalization pass] diff --git a/aten/src/ATen/FunctionalTensorWrapper.h b/aten/src/ATen/FunctionalTensorWrapper.h index 1dd9104968592..d3237080535c0 100644 --- a/aten/src/ATen/FunctionalTensorWrapper.h +++ b/aten/src/ATen/FunctionalTensorWrapper.h @@ -56,7 +56,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { explicit FunctionalTensorWrapper( const Tensor& view_value, const FunctionalTensorWrapper* base, - functionalization::ViewMeta meta); + const functionalization::ViewMeta& meta); // Get the underlying, actual tensor, that doesn't know anything about // functionalization. @@ -75,26 +75,32 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { return has_metadata_mutation_; }; + void mark_mutation() { + functional_storage_impl()->mark_mutation(); + } // Denotes a mutation that's hidden from autograd, // e.g. for the purposes of passing a tensor to a triton kernel void mark_mutation_hidden_from_autograd() { - mutation_hidden_from_autograd_counter_++; + functional_storage_impl()->mark_mutation_hidden_from_autograd(); } void mark_mutation_during_no_grad_or_inference_mode() { - mutation_during_no_grad_or_inference_mode_++; + functional_storage_impl()->mark_mutation_during_no_grad_or_inference_mode(); } // Are all the mutations happening to the tensor hidden from autograd bool are_all_mutations_hidden_from_autograd() const { - return mutation_hidden_from_autograd_counter_ == mutation_counter_; + return functional_storage_impl()->are_all_mutations_hidden_from_autograd(); } // Did all mutations happen under no_grad or inference_mode // (We also need to ignore mutations fully hidden from autograd here) bool are_all_mutations_under_no_grad_or_inference_mode() const { - return mutation_hidden_from_autograd_counter_ + - mutation_during_no_grad_or_inference_mode_ == - mutation_counter_; + return functional_storage_impl() + ->are_all_mutations_under_no_grad_or_inference_mode(); } + // Runs the forward_fn of every ViewMeta collected in the current instance + // to some other base. + Tensor apply_view_metas(const Tensor& base); + // Sync's the underlying tensor with its alias, if it's out of date. 
This // involves two steps: 1) Apply any pending updates/mutations to the alias 2) // Replay the views (if any) to regenerate the current tensor off of the @@ -130,7 +136,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { // from the base tensor. This method is used by inplace-view ops like // transpose_. It appends a ViewMeta to the existing stack, and refreshes the // tensor by replaying the views off of the alias. - void mutate_view_meta(at::functionalization::ViewMeta meta); + void mutate_view_meta(const at::functionalization::ViewMeta& meta); // Custom implementation of self.set_(src) void set__impl(const FunctionalTensorWrapper* other); @@ -156,7 +162,7 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { // a.replace_(tmp) // // replace_() swaps out the wrapped tensor, value_, with tmp. - void replace_(const Tensor& other); + void replace_(const Tensor& other, bool from_lazy_regenerate = false); bool is_multi_output_view() { return is_multi_output_view_; @@ -211,18 +217,22 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { VariableVersion&& version_counter, bool allow_tensor_metadata_change) const; + void shallow_copy_from(const c10::intrusive_ptr& impl) override; + void copy_tensor_metadata_and_refresh( + const FunctionalTensorWrapper* src_impl, + FunctionalTensorWrapper* dest_impl, + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change) const; + // Note that value is not taken by reference: internally, the wrapper will // change the value tensor that it points to over time. Tensor value_; - int64_t level_; + int64_t level_{}; // These two counters are used for identifying // whether all the mutations on a given tensor are hidden from autograd or // not. If we have an input mutation that is hidden from autograd, then once // we convert the input mutation to a copy_() we know it will be safe to hide // the copy_() from autograd as well. - uint64_t mutation_counter_ = 0; - uint64_t mutation_hidden_from_autograd_counter_ = 0; - uint64_t mutation_during_no_grad_or_inference_mode_ = 0; bool has_metadata_mutation_ = false; bool is_multi_output_view_ = false; // Did the tensor experience a set_() call. @@ -230,6 +240,13 @@ struct TORCH_API FunctionalTensorWrapper : public c10::TensorImpl { size_t generation_ = 0; std::vector view_metas_; + + protected: + static void copy_tensor_metadata( + const FunctionalTensorWrapper* src_impl, + FunctionalTensorWrapper* dest_impl, + const c10::VariableVersion& version_counter, + bool allow_tensor_metadata_change); }; // Utility functions for the functionalization pass. 
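// --- Illustrative sketch (not part of the upstream patch) ---------------------
// apply_view_metas(), declared above, factors the view-replay loop out of
// regenerate_from_base(): each ViewMeta carries a forward lambda, and replaying
// a view chain means folding those lambdas over a (possibly different) base.
// A minimal model of that idea; ViewStep and replay_views are illustrative
// names, not the real at::functionalization types:
#include <cstdint>
#include <functional>
#include <vector>

template <class TensorT>
struct ViewStep {
  std::function<TensorT(const TensorT&, int64_t)> forward;
  int64_t out_index = 0;  // which output of a multi-output view to pick
};

template <class TensorT>
TensorT replay_views(TensorT t, const std::vector<ViewStep<TensorT>>& steps) {
  for (const auto& step : steps) {
    t = step.forward(t, step.out_index);  // re-derive the view from the new base
  }
  return t;
}
// -------------------------------------------------------------------------------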
@@ -310,9 +327,11 @@ Tensor create_functional_tensor_with_view_meta( std::vector create_functional_tensor_with_view_meta( ITensorListRef view_to_wrap, const Tensor& base, - functionalization::ViewMeta meta); + const functionalization::ViewMeta& meta); -void mutate_view_meta(const Tensor& self, functionalization::ViewMeta meta); +void mutate_view_meta( + const Tensor& self, + const functionalization::ViewMeta& meta); void set_sizes_strides_offset(const Tensor& out, const Tensor& meta_out); void set_sizes_strides_offset( diff --git a/aten/src/ATen/FunctionalizeFallbackKernel.cpp b/aten/src/ATen/FunctionalizeFallbackKernel.cpp index 783a925d69833..594f627e17ccf 100644 --- a/aten/src/ATen/FunctionalizeFallbackKernel.cpp +++ b/aten/src/ATen/FunctionalizeFallbackKernel.cpp @@ -30,17 +30,29 @@ namespace { void functionalizeFallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatchKeySet, torch::jit::Stack* stack) { const auto& schema = op.schema(); + // NB: auto_functionalize handles the case where outputs do not have alias info. + // This error message therefore suggests that users modify their custom op to the + // point where auto_functionalize works instead of asking them to try the raw + // functionalization API (because that is a bit difficult to use). + // If you're here and want to try the raw functionalization kernel approach, + // see https://gist.github.com/bdhirsh/7dadbf6296f8f7d1abcf4c482f438aaa TORCH_CHECK( !schema.hasAnyAliasInfo(), - "Found a custom (non-ATen) operator that either mutates or its inputs: ", - op.operator_name().name, ".", op.operator_name().overload_name, - ". Getting these operators to work with functionalization requires some extra work", - ". For mutable ops you need to register a corresponding out-of-place variant of the op,", - " and you also need to register a Functionalization kernel that performs some boilerplate,", - " telling functionalization to map from the mutable op to the out-of-place op", - ". See a more complete example of how to do this at ", - "https://gist.github.com/bdhirsh/7dadbf6296f8f7d1abcf4c482f438aaa.", - " Please file a GitHub issue if you run into any problems."); + "Found a custom (non-ATen) operator whose output has alias annotations: ", + op.schema(), + ". We only support functionalizing operators whose outputs do not have alias ", + "annotations (e.g. 'Tensor(a)' is a Tensor with an alias annotation whereas ", + "'Tensor' is a Tensor without. The '(a)' is the alias annotation). " + "The alias annotation specifies that the output ", + "Tensor shares storage with an input that has the same annotation. ", + "Please check if ", + "(1) the output needs to be an output (if not, don't return it), ", + "(2) if the output doesn't share storage with any inputs, then ", + "delete the alias annotation. ", + "(3) if the output indeed shares storage with an input, then add a ", + ".clone() before returning it to prevent storage sharing and then " +"delete the alias annotation. 
", + "Otherwise, please file an issue on GitHub."); const auto num_arguments = schema.arguments().size(); const auto arguments_begin = stack->size() - num_arguments; auto arguments = torch::jit::last(stack, num_arguments); @@ -168,7 +180,7 @@ static const at::Tensor & resize__functionalization(c10::DispatchKeySet dispatch return base.as_strided_scatter(mutated_view, size, c10::contiguous_strides(size)); } ); - at::functionalization::impl::mutate_view_meta(self, std::move(view_meta)); + at::functionalization::impl::mutate_view_meta(self, view_meta); return self; } @@ -198,7 +210,13 @@ static at::Tensor lift_fresh_functionalize_copy(const at::Tensor & self) { // but that isn't really a use case today. // Needed for https://github.com/pytorch/pytorch/issues/105327 if (at::functionalization::impl::isFunctionalTensor(self)) { - return self.clone(); + // Note [Composite Functionalization under PreDispatch mode] + // When we are tracing under PreDispatch, PreDispatch key will be + // in the local include TLS. As a result, when we redispatch here, + // we will end up hitting PreDispatch stack first. So, we should + // directly redispatch to the functionalize key manually. + static auto op = c10::Dispatcher::singleton().findSchemaOrThrow("aten::clone", "").typed)>(); + return op.redispatch(c10::DispatchKeySet({c10::DispatchKey::Functionalize}), self, c10::nullopt); } at::AutoDispatchSkipFunctionalize guard; @@ -304,15 +322,15 @@ static at::Tensor& set__functionalize(at::Tensor& self, const at::Tensor& src) { TORCH_CHECK(at::functionalization::impl::isFunctionalTensor(self) || !at::functionalization::impl::isFunctionalTensor(src), "set__functionalize: Tried to mutate a non-functional tensor with a functional tensor, which is not allowed"); - TORCH_CHECK(at::functionalization::impl::isFunctionalTensor(src), - "set__functionalize: We do not currently support x.set_(y) where y is not a FunctionalTensor. Please file an issue"); - // nop case if (!at::functionalization::impl::isFunctionalTensor(self) && !at::functionalization::impl::isFunctionalTensor(src)) { at::AutoDispatchSkipFunctionalize guard; return self.set_(src); } + TORCH_CHECK(at::functionalization::impl::isFunctionalTensor(src), + "set__functionalize: We do not currently support x.set_(y) where y is not a FunctionalTensor. Please file an issue"); + TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(self)); TORCH_INTERNAL_ASSERT(at::functionalization::impl::isFunctionalTensor(src)); auto self_impl = at::functionalization::impl::unsafeGetFunctionalWrapper(self); diff --git a/aten/src/ATen/InferSize.h b/aten/src/ATen/InferSize.h index 111c7eb8f5fc7..caa8ec42003c9 100644 --- a/aten/src/ATen/InferSize.h +++ b/aten/src/ATen/InferSize.h @@ -37,7 +37,8 @@ inline void infer_size_impl( } } - if (numel == newsize || (infer_dim && newsize > 0 && numel % newsize == 0)) { + if (TORCH_GUARD_SIZE_OBLIVIOUS(sym_eq(numel, newsize)) || + (infer_dim && newsize > 0 && numel % newsize == 0)) { if (infer_dim) { // We have a degree of freedom here to select the dimension size; follow // NumPy semantics and just bail. 
However, a nice error message is needed diff --git a/aten/src/ATen/LegacyBatchedTensorImpl.h b/aten/src/ATen/LegacyBatchedTensorImpl.h index 732e252165ca6..098fbf9d6292f 100644 --- a/aten/src/ATen/LegacyBatchedTensorImpl.h +++ b/aten/src/ATen/LegacyBatchedTensorImpl.h @@ -1,7 +1,6 @@ #pragma once #include -#include #include #include diff --git a/aten/src/ATen/LegacyVmapTransforms.cpp b/aten/src/ATen/LegacyVmapTransforms.cpp index ca43993ed7d35..5560f9a0d7963 100644 --- a/aten/src/ATen/LegacyVmapTransforms.cpp +++ b/aten/src/ATen/LegacyVmapTransforms.cpp @@ -135,9 +135,7 @@ static Tensor alignBatchDimsAtFront( const Tensor& self, std::bitset requested_levels, int64_t requested_example_dim) { - Tensor physical_tensor; - std::bitset tensor_levels; - std::tie(physical_tensor, tensor_levels) = getPhysicalTensorAndLevels(self); + auto [physical_tensor, tensor_levels] = getPhysicalTensorAndLevels(self); TORCH_INTERNAL_ASSERT( (tensor_levels | requested_levels) == requested_levels, @@ -263,10 +261,7 @@ VmapPhysicalViewVec BroadcastingVmapTransform::logicalToPhysical(TensorList logi VmapPhysicalViewVec result; - std::bitset levels; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t largest_logical_dim; - std::tie(levels, largest_logical_dim) = getLevelsAndLargestLogicalDim(logical_tensors); + auto [levels, largest_logical_dim] = getLevelsAndLargestLogicalDim(logical_tensors); for (const auto& tensor : logical_tensors) { // NB: It's possible that we didn't actually need to align `tensor`. diff --git a/aten/src/ATen/LegacyVmapTransforms.h b/aten/src/ATen/LegacyVmapTransforms.h index b32b182056556..97729b3254e74 100644 --- a/aten/src/ATen/LegacyVmapTransforms.h +++ b/aten/src/ATen/LegacyVmapTransforms.h @@ -113,8 +113,8 @@ struct VmapPhysicalToLogicalMap; // levels: 012345 struct TORCH_API VmapPhysicalView { VmapPhysicalView(Tensor&& tensor, std::bitset levels) - : levels_(levels), tensor_(tensor) { - TORCH_INTERNAL_ASSERT(!isBatchedTensor(tensor)); + : levels_(levels), tensor_(std::move(tensor)) { + TORCH_INTERNAL_ASSERT(!isBatchedTensor(tensor_)); } Tensor& tensor() { diff --git a/aten/src/ATen/MapAllocator.cpp b/aten/src/ATen/MapAllocator.cpp index 497b53d35b048..19c08634d2cf9 100644 --- a/aten/src/ATen/MapAllocator.cpp +++ b/aten/src/ATen/MapAllocator.cpp @@ -63,7 +63,6 @@ constexpr const char* unknown_eventname = "eventname not specified"; MapAllocator::MapAllocator(WithFd, c10::string_view filename, int fd, int flags, size_t size) : filename_(filename.empty() ? 
unknown_filename : filename) - , flags_(0) // to be filled later , size_(0) // to be filled later #ifdef _WIN32 , handle_(INVALID_HANDLE_VALUE) // to be filled later @@ -72,7 +71,6 @@ MapAllocator::MapAllocator(WithFd, c10::string_view filename, int fd, int flags, #else , fd_(fd) #endif - , base_ptr_(nullptr) { if (!(flags & ALLOCATOR_MAPPED_SHARED) && !(flags & ALLOCATOR_MAPPED_SHAREDMEM)) { @@ -252,11 +250,13 @@ MapAllocator::MapAllocator(WithFd, c10::string_view filename, int fd, int flags, if (!(flags_ & ALLOCATOR_MAPPED_FROMFD)) { if (flags_ & ALLOCATOR_MAPPED_SHARED) { + // NOLINTNEXTLINE(bugprone-assignment-in-if-condition) if ((fd = open(filename_.c_str(), flags, (mode_t)0600)) == -1) { TORCH_CHECK(false, "unable to open file <", filename_, "> in read-write mode: ", strerror(errno), " (", errno, ")"); } } else if (flags_ & ALLOCATOR_MAPPED_SHAREDMEM) { #ifdef HAVE_SHM_OPEN + // NOLINTNEXTLINE(bugprone-assignment-in-if-condition) if((fd = shm_open(filename_.c_str(), flags, (mode_t)0600)) == -1) { TORCH_CHECK(false, "unable to open shared memory object <", filename_, "> in read-write mode: ", strerror(errno), " (", errno, ")"); } @@ -264,6 +264,7 @@ MapAllocator::MapAllocator(WithFd, c10::string_view filename, int fd, int flags, TORCH_CHECK(false, "unable to open file <", filename_, "> in sharedmem mode, shm_open unavailable on this platform"); #endif } else { + // NOLINTNEXTLINE(bugprone-assignment-in-if-condition) if ((fd = open(filename_.c_str(), O_RDONLY)) == -1) { TORCH_CHECK(false, "unable to open file <", filename_, "> in read-only mode: ", strerror(errno), " (", errno, ")"); } @@ -272,7 +273,7 @@ MapAllocator::MapAllocator(WithFd, c10::string_view filename, int fd, int flags, fd = fd_; } - struct stat file_stat; + struct stat file_stat{}; if (fstat(fd, &file_stat) == -1) { int last_err = errno; if (!(flags_ & ALLOCATOR_MAPPED_FROMFD)) { @@ -284,7 +285,7 @@ MapAllocator::MapAllocator(WithFd, c10::string_view filename, int fd, int flags, if (size > 0) { if (static_cast(size) > file_stat.st_size) { if (flags_) { - if (ftruncate(fd, size) == -1) { + if (ftruncate(fd, static_cast(size)) == -1) { TORCH_CHECK(false, "unable to resize file <", filename_, "> to the right size: ", strerror(errno), " (", errno, ")"); } if (fstat(fd, &file_stat) == -1 || file_stat.st_size < static_cast(size)) { @@ -311,7 +312,7 @@ MapAllocator::MapAllocator(WithFd, c10::string_view filename, int fd, int flags, size = file_stat.st_size; } - size_ = size; /* if we are here, it must be the right size */ + size_ = static_cast(size); /* if we are here, it must be the right size */ /* map it */ if (flags_ & (ALLOCATOR_MAPPED_SHARED | ALLOCATOR_MAPPED_SHAREDMEM)) { @@ -325,6 +326,11 @@ MapAllocator::MapAllocator(WithFd, c10::string_view filename, int fd, int flags, TORCH_CHECK(false, "unable to mmap ", size_, " bytes from file <", filename_, ">: ", strerror(errno), " (", errno, ")"); } +#if !defined(__APPLE__) && !defined(__ANDROID__) + /* attempt to use larger block size on Linux, which is important for getting better CUDA upload speed */ + posix_fadvise(fd, 0, static_cast(size), POSIX_FADV_SEQUENTIAL); +#endif + if (flags_ & ALLOCATOR_MAPPED_KEEPFD) { fd_ = fd; } else { @@ -601,8 +607,7 @@ void* RefcountedMapAllocator::data() const { } MapAllocator::~MapAllocator() { - // NOLINTNEXTLINE(clang-analyzer-optin.cplusplus.VirtualCall) - close(); + MapAllocator::close(); c10::reportMemoryUsageToProfiler(base_ptr_, -size_, 0, 0, c10::Device(c10::DeviceType::CPU)); } diff --git a/aten/src/ATen/MapAllocator.h 
b/aten/src/ATen/MapAllocator.h index 3354ab84577f1..f4a30edef6239 100644 --- a/aten/src/ATen/MapAllocator.h +++ b/aten/src/ATen/MapAllocator.h @@ -128,7 +128,7 @@ class TORCH_API RefcountedMapAllocator : private RefcountedMapAllocatorArgCheck, void close() override; ~RefcountedMapAllocator() override { - close(); + RefcountedMapAllocator::close(); } protected: diff --git a/aten/src/ATen/MatrixRef.h b/aten/src/ATen/MatrixRef.h index ba693ab7d5809..901efff4cc23f 100644 --- a/aten/src/ATen/MatrixRef.h +++ b/aten/src/ATen/MatrixRef.h @@ -94,16 +94,16 @@ class MatrixRef { /// The declaration here is extra complicated so that "arrayRef = {}" /// continues to select the move assignment operator. template - typename std::enable_if::value, MatrixRef>::type& - operator=(U&& Temporary) = delete; + std::enable_if_t, MatrixRef>& operator=( + U&& Temporary) = delete; /// Disallow accidental assignment from a temporary. /// /// The declaration here is extra complicated so that "arrayRef = {}" /// continues to select the move assignment operator. template - typename std::enable_if::value, MatrixRef>::type& - operator=(std::initializer_list) = delete; + std::enable_if_t, MatrixRef>& operator=( + std::initializer_list) = delete; }; } // end namespace at diff --git a/aten/src/ATen/NamedTensorUtils.cpp b/aten/src/ATen/NamedTensorUtils.cpp index 7195d04f0f4cd..a76156c03402d 100644 --- a/aten/src/ATen/NamedTensorUtils.cpp +++ b/aten/src/ATen/NamedTensorUtils.cpp @@ -297,7 +297,7 @@ static int64_t num_batch_dims(DimnameList names) { if (names.size() <= 2) { return 0; } - return names.size() - 2; + return static_cast(names.size() - 2); } static std::vector compute_matmul_outnames( diff --git a/aten/src/ATen/NestedTensorImpl.cpp b/aten/src/ATen/NestedTensorImpl.cpp index 223a95ebc132b..2f73b7b304ee3 100644 --- a/aten/src/ATen/NestedTensorImpl.cpp +++ b/aten/src/ATen/NestedTensorImpl.cpp @@ -10,6 +10,7 @@ #include #include +#include namespace { inline void validate_nested_tensor_metadata( @@ -67,8 +68,8 @@ c10::DispatchKeySet get_view_key_set(const at::Tensor& base) { } } // namespace -namespace at { -namespace native { + +namespace at::native { inline std::vector construct_opt_sizes(const at::Tensor& sizes) { // torch.tensor([]) is considered to have `dim() = 1` and `size(0) = 0` @@ -80,7 +81,7 @@ inline std::vector construct_opt_sizes(const at::Tensor& sizes) { std::vector result(1, sizes.sizes()[0]); if (sizes.dim() > 0) { size_t nested_dim = result.size(); - int64_t* sizes_ptr = sizes.data_ptr(); + const int64_t* sizes_ptr = sizes.const_data_ptr(); result.resize(nested_dim + sizes.sizes()[1]); int64_t sizes_size_0 = sizes.sizes()[0]; int64_t sizes_size_1 = sizes.sizes()[1]; @@ -100,7 +101,7 @@ inline std::vector construct_opt_sizes(const at::Tensor& sizes) { } // assume contiguous, we can construct stride from size -inline at::Tensor construct_nested_strides(const at::Tensor& sizes) { +at::Tensor construct_nested_strides(const at::Tensor& sizes) { // empty `sizes` means empty nested tensor, so return empty strides if (sizes.dim() == 0) { return sizes; @@ -113,7 +114,7 @@ inline at::Tensor construct_nested_strides(const at::Tensor& sizes) { return sizes; } at::Tensor strides = sizes.new_empty(sizes.sizes()); - const int64_t* sizes_ptr = sizes.data_ptr(); + const int64_t* sizes_ptr = sizes.const_data_ptr(); int64_t* strides_ptr = strides.data_ptr(); for (int64_t i = 0; i < sizes.size(0); i++) { strides_ptr[orig_dim - 1] = 1; @@ -138,7 +139,7 @@ inline at::Tensor construct_nested_strides(const at::Tensor& sizes) { 
* * @return A tensor of offsets */ -inline at::Tensor construct_offsets(const at::Tensor& sizes) { +at::Tensor construct_offsets(const at::Tensor& sizes) { // empty `sizes` means empty nested tensor, so return empty strides if (sizes.dim() == 0) { return at::empty({0}, sizes.options().dtype(kLong)); @@ -151,10 +152,10 @@ inline at::Tensor construct_offsets(const at::Tensor& sizes) { std::iota(offsets_ptr, offsets_ptr + ntensors, 0); return offsets; } - const int64_t* sizes_ptr = sizes.data_ptr(); + const int64_t* sizes_ptr = sizes.const_data_ptr(); offsets_ptr[0] = 0; for (const auto i : c10::irange(ntensors - 1)) { - const int64_t row_product = std::accumulate(sizes_ptr, sizes_ptr + orig_dim, 1, std::multiplies()); + const int64_t row_product = std::accumulate(sizes_ptr, sizes_ptr + orig_dim, 1, std::multiplies()); offsets_ptr[i + 1] = offsets_ptr[i] + row_product; sizes_ptr += orig_dim; } @@ -188,7 +189,7 @@ NestedTensorImpl::NestedTensorImpl( } NestedTensorImpl::NestedTensorImpl( - at::Tensor buffer, + const at::Tensor& buffer, at::Tensor nested_sizes, at::Tensor nested_strides, at::Tensor storage_offsets) @@ -196,9 +197,9 @@ NestedTensorImpl::NestedTensorImpl( buffer.storage(), generate_nested_key_set_from_buffer(buffer), buffer.dtype(), - nested_sizes, - nested_strides, - storage_offsets) { + std::move(nested_sizes), + std::move(nested_strides), + std::move(storage_offsets)) { TORCH_INTERNAL_ASSERT( buffer.dim() == 1, @@ -210,8 +211,8 @@ NestedTensorImpl::NestedTensorImpl( // assume contiguous, `nested_strides` and `offsets` // can be infered from `nested_sizes` NestedTensorImpl::NestedTensorImpl( - at::Tensor buffer, - at::Tensor nested_sizes) + const at::Tensor& buffer, + const at::Tensor& nested_sizes) : NestedTensorImpl( buffer, nested_sizes, @@ -343,7 +344,7 @@ int64_t get_numel_from_nested_size_tensor(const at::Tensor& tensor) { static_cast(std::numeric_limits::max()), static_cast(std::numeric_limits::max())); - const int64_t* sizes_ptr = tensor.data_ptr(); + const int64_t* sizes_ptr = tensor.const_data_ptr(); const auto nt_dim = tensor.size(1); uint64_t num_elements{0}; @@ -359,5 +360,4 @@ int64_t get_numel_from_nested_size_tensor(const at::Tensor& tensor) { return static_cast(num_elements); } -} // namespace native -} // namespace at +} // namespace at::native diff --git a/aten/src/ATen/NestedTensorImpl.h b/aten/src/ATen/NestedTensorImpl.h index 11d7e2f165548..0bd3d98e73c5c 100644 --- a/aten/src/ATen/NestedTensorImpl.h +++ b/aten/src/ATen/NestedTensorImpl.h @@ -14,6 +14,8 @@ namespace at::native { struct NestedTensorImpl; inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt); int64_t get_numel_from_nested_size_tensor(const at::Tensor& tensor); +at::Tensor construct_nested_strides(const at::Tensor& nested_size); +at::Tensor construct_offsets(const at::Tensor& nested_size); struct TORCH_API NestedTensorImpl : public c10::TensorImpl { explicit NestedTensorImpl( @@ -25,13 +27,15 @@ struct TORCH_API NestedTensorImpl : public c10::TensorImpl { at::Tensor storage_offsets); explicit NestedTensorImpl( - at::Tensor buffer, + const at::Tensor& buffer, at::Tensor nested_sizes, at::Tensor nested_strides, at::Tensor storage_offsets); // assume contiguous, `nested_strides` and `offsets` // can be infered from `nested_sizes` - explicit NestedTensorImpl(at::Tensor buffer, at::Tensor nested_sizes); + explicit NestedTensorImpl( + const at::Tensor& buffer, + const at::Tensor& nested_sizes); // This constructor is used creating view tensors from nested tensors explicit 
NestedTensorImpl( @@ -224,7 +228,8 @@ inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt) { } const Tensor &sizemat = nt->get_nested_sizes(), &stridemat = nt->get_nested_strides(); - int64_t* offsets_ptr = nt->get_storage_offsets().data_ptr(); + const int64_t* offsets_ptr = + nt->get_storage_offsets().const_data_ptr(); int64_t orig_dim = sizemat.size(1); // nesting scalars if (orig_dim == 0) { @@ -239,8 +244,8 @@ inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt) { // nesting tensors else { // if any underlying tensor is non-contiguous - const int64_t *sizemat_ptr = sizemat.data_ptr(), - *stridemat_ptr = stridemat.data_ptr(); + const int64_t *sizemat_ptr = sizemat.const_data_ptr(), + *stridemat_ptr = stridemat.const_data_ptr(); for (int64_t i = 0; i < ntensors; i++) { if (stridemat_ptr[orig_dim - 1] != 1) { return false; @@ -259,8 +264,8 @@ inline bool nested_tensor_impl_is_contiguous(const NestedTensorImpl* nt) { if (offsets_ptr[0] != 0) { return false; } - sizemat_ptr = sizemat.data_ptr(); - stridemat_ptr = stridemat.data_ptr(); + sizemat_ptr = sizemat.const_data_ptr(); + stridemat_ptr = stridemat.const_data_ptr(); for (int64_t i = 1; i < ntensors; i++) { if (offsets_ptr[i] != offsets_ptr[i - 1] + *sizemat_ptr * *stridemat_ptr) { diff --git a/aten/src/ATen/NumericUtils.h b/aten/src/ATen/NumericUtils.h index 06b25334bb13e..788da64b4e427 100644 --- a/aten/src/ATen/NumericUtils.h +++ b/aten/src/ATen/NumericUtils.h @@ -7,7 +7,9 @@ #include #include #include +#include #include +#include #include #include @@ -20,16 +22,12 @@ namespace at { // (uselessly) convert to floating point and then do the test. // This function is. -template < - typename T, - typename std::enable_if::value, int>::type = 0> +template , int> = 0> inline C10_HOST_DEVICE bool _isnan(T /*val*/) { return false; } -template < - typename T, - typename std::enable_if::value, int>::type = 0> +template , int> = 0> inline C10_HOST_DEVICE bool _isnan(T val) { #if defined(__CUDACC__) || defined(__HIPCC__) return ::isnan(val); @@ -38,24 +36,19 @@ inline C10_HOST_DEVICE bool _isnan(T val) { #endif } -template < - typename T, - typename std::enable_if::value, int>::type = 0> +template ::value, int> = 0> inline C10_HOST_DEVICE bool _isnan(T val) { return std::isnan(val.real()) || std::isnan(val.imag()); } -template < - typename T, - typename std::enable_if::value, int>::type = 0> +template , int> = 0> inline C10_HOST_DEVICE bool _isnan(T val) { return at::_isnan(static_cast(val)); } template < typename T, - typename std::enable_if::value, int>::type = - 0> + std::enable_if_t, int> = 0> inline C10_HOST_DEVICE bool _isnan(at::BFloat16 val) { return at::_isnan(static_cast(val)); } @@ -66,16 +59,28 @@ inline C10_HOST_DEVICE bool _isnan(at::BFloat16 val) { template < typename T, - typename std::enable_if::value, int>:: - type = 0> + std::enable_if_t, int> = 0> +inline C10_HOST_DEVICE bool _isnan(T val) { + return val.isnan(); +} + +template < + typename T, + std::enable_if_t, int> = 0> +inline C10_HOST_DEVICE bool _isnan(T val) { + return val.isnan(); +} + +template < + typename T, + std::enable_if_t, int> = 0> inline C10_HOST_DEVICE bool _isnan(T val) { return val.isnan(); } template < typename T, - typename std::enable_if::value, int>:: - type = 0> + std::enable_if_t, int> = 0> inline C10_HOST_DEVICE bool _isnan(T val) { return val.isnan(); } @@ -84,16 +89,12 @@ inline C10_HOST_DEVICE bool _isnan(T val) { // (uselessly) convert to floating point and then do the test. // This function is. 
-template < - typename T, - typename std::enable_if::value, int>::type = 0> +template , int> = 0> inline C10_HOST_DEVICE bool _isinf(T /*val*/) { return false; } -template < - typename T, - typename std::enable_if::value, int>::type = 0> +template , int> = 0> inline C10_HOST_DEVICE bool _isinf(T val) { #if defined(__CUDACC__) || defined(__HIPCC__) return ::isinf(val); @@ -118,10 +119,18 @@ inline C10_HOST_DEVICE bool _isinf(at::Float8_e4m3fn val) { return false; } +inline C10_HOST_DEVICE bool _isinf(at::Float8_e5m2fnuz val) { + return false; +} + +inline C10_HOST_DEVICE bool _isinf(at::Float8_e4m3fnuz val) { + return false; +} + template C10_HOST_DEVICE inline T exp(T x) { static_assert( - !std::is_same::value, + !std::is_same_v, "this template must be used with float or less precise type"); #if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__) // use __expf fast approximation for peak bandwidth @@ -139,7 +148,7 @@ C10_HOST_DEVICE inline double exp(double x) { template C10_HOST_DEVICE inline T log(T x) { static_assert( - !std::is_same::value, + !std::is_same_v, "this template must be used with float or less precise type"); #if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__) // use __logf fast approximation for peak bandwidth @@ -157,7 +166,7 @@ C10_HOST_DEVICE inline double log(double x) { template C10_HOST_DEVICE inline T log1p(T x) { static_assert( - !std::is_same::value, + !std::is_same_v, "this template must be used with float or less precise type"); #if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__) // use __logf fast approximation for peak bandwidth @@ -176,7 +185,7 @@ C10_HOST_DEVICE inline double log1p(double x) { template C10_HOST_DEVICE inline T tan(T x) { static_assert( - !std::is_same::value, + !std::is_same_v, "this template must be used with float or less precise type"); #if defined(__CUDA_ARCH__) || defined(__HIP_ARCH__) // use __tanf fast approximation for peak bandwidth diff --git a/aten/src/ATen/OpMathType.h b/aten/src/ATen/OpMathType.h index ddb2ce71be05f..d00195b07e490 100644 --- a/aten/src/ATen/OpMathType.h +++ b/aten/src/ATen/OpMathType.h @@ -4,7 +4,9 @@ #include #include #include +#include #include +#include #include namespace at { @@ -31,6 +33,14 @@ struct OpMathType { using type = float; }; template <> +struct OpMathType { + using type = float; +}; +template <> +struct OpMathType { + using type = float; +}; +template <> struct OpMathType> { using type = c10::complex; }; diff --git a/aten/src/ATen/OpaqueTensorImpl.h b/aten/src/ATen/OpaqueTensorImpl.h index e6c6413815bbd..f71ae5358f299 100644 --- a/aten/src/ATen/OpaqueTensorImpl.h +++ b/aten/src/ATen/OpaqueTensorImpl.h @@ -33,6 +33,7 @@ struct TORCH_API OpaqueTensorImpl : public TensorImpl { set_custom_sizes_strides(SizesStridesPolicy::CustomStrides); sizes_and_strides_.set_sizes(sizes); refresh_numel(); + // NOLINTNEXTLINE(cppcoreguidelines-prefer-member-initializer) is_non_overlapping_and_dense_ = is_non_overlapping_and_dense; } diff --git a/aten/src/ATen/Parallel-inl.h b/aten/src/ATen/Parallel-inl.h index 62f287fc33c42..a5e682281abe5 100644 --- a/aten/src/ATen/Parallel-inl.h +++ b/aten/src/ATen/Parallel-inl.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include namespace at { @@ -24,13 +25,19 @@ inline void parallel_for( at::get_num_threads() > 1); if (!use_parallel) { internal::ThreadIdGuard tid_guard(0); + c10::ParallelGuard guard(true); f(begin, end); return; } - internal::invoke_parallel(begin, end, grain_size, f); + internal::invoke_parallel( + begin, end, grain_size, [&](int64_t begin, int64_t end) { + 
c10::ParallelGuard guard(true); + f(begin, end); + }); #else internal::ThreadIdGuard tid_guard(0); + c10::ParallelGuard guard(true); f(begin, end); #endif } @@ -56,6 +63,7 @@ inline scalar_t parallel_reduce( max_threads > 1); if (!use_parallel) { internal::ThreadIdGuard tid_guard(0); + c10::ParallelGuard guard(true); return f(begin, end, ident); } @@ -66,6 +74,7 @@ inline scalar_t parallel_reduce( grain_size, [&](const int64_t my_begin, const int64_t my_end) { const auto tid = at::get_thread_num(); + c10::ParallelGuard guard(true); results[tid] = f(my_begin, my_end, ident); }); @@ -76,6 +85,7 @@ inline scalar_t parallel_reduce( return result; #else internal::ThreadIdGuard tid_guard(0); + c10::ParallelGuard guard(true); return f(begin, end, ident); #endif } diff --git a/aten/src/ATen/ParallelCommon.cpp b/aten/src/ATen/ParallelCommon.cpp index 285713417cb4c..0504a066eef50 100644 --- a/aten/src/ATen/ParallelCommon.cpp +++ b/aten/src/ATen/ParallelCommon.cpp @@ -15,6 +15,10 @@ #include #endif +#if defined(__APPLE__) && defined(__aarch64__) && !defined(C10_MOBILE) +#include +#endif + namespace at { namespace { @@ -46,30 +50,30 @@ std::string get_parallel_info() { std::ostringstream ss; ss << "ATen/Parallel:\n\tat::get_num_threads() : " - << at::get_num_threads() << std::endl; + << at::get_num_threads() << '\n'; ss << "\tat::get_num_interop_threads() : " - << at::get_num_interop_threads() << std::endl; + << at::get_num_interop_threads() << '\n'; - ss << at::get_openmp_version() << std::endl; + ss << at::get_openmp_version() << '\n'; #ifdef _OPENMP - ss << "\tomp_get_max_threads() : " << omp_get_max_threads() << std::endl; + ss << "\tomp_get_max_threads() : " << omp_get_max_threads() << '\n'; #endif - ss << at::get_mkl_version() << std::endl; + ss << at::get_mkl_version() << '\n'; #if AT_MKL_ENABLED() - ss << "\tmkl_get_max_threads() : " << mkl_get_max_threads() << std::endl; + ss << "\tmkl_get_max_threads() : " << mkl_get_max_threads() << '\n'; #endif - ss << at::get_mkldnn_version() << std::endl; + ss << at::get_mkldnn_version() << '\n'; ss << "std::thread::hardware_concurrency() : " - << std::thread::hardware_concurrency() << std::endl; + << std::thread::hardware_concurrency() << '\n'; - ss << "Environment variables:" << std::endl; + ss << "Environment variables:" << '\n'; ss << "\tOMP_NUM_THREADS : " - << get_env_var("OMP_NUM_THREADS", "[not set]") << std::endl; + << get_env_var("OMP_NUM_THREADS", "[not set]") << '\n'; ss << "\tMKL_NUM_THREADS : " - << get_env_var("MKL_NUM_THREADS", "[not set]") << std::endl; + << get_env_var("MKL_NUM_THREADS", "[not set]") << '\n'; ss << "ATen parallel backend: "; #if AT_PARALLEL_OPENMP @@ -82,7 +86,7 @@ std::string get_parallel_info() { #ifdef C10_MOBILE ss << " [mobile]"; #endif - ss << std::endl; + ss << '\n'; #if AT_EXPERIMENTAL_SINGLE_THREAD_POOL ss << "Experimental: single thread pool" << std::endl; @@ -104,11 +108,23 @@ int intraop_default_num_threads() { #if defined(FBCODE_CAFFE2) && defined(__aarch64__) nthreads = 1; #else +#if defined(__aarch64__) && defined(__APPLE__) + // On Apple Silicon there are efficiency and performance cores + // Restrict parallel algorithms to performance cores by default + int32_t num_cores = -1; + size_t num_cores_len = sizeof(num_cores); + if (sysctlbyname("hw.perflevel0.physicalcpu", &num_cores, &num_cores_len, nullptr, 0) == 0) { + if (num_cores > 1) { + nthreads = num_cores; + return num_cores; + } + } +#endif nthreads = TaskThreadPoolBase::defaultNumThreads(); #endif } - return nthreads; -#endif + return
static_cast(nthreads); +#endif /* !defined(C10_MOBILE) */ } } // namespace at diff --git a/aten/src/ATen/ParallelNative.cpp b/aten/src/ATen/ParallelNative.cpp index 948d1e5921c7a..a2e1992650009 100644 --- a/aten/src/ATen/ParallelNative.cpp +++ b/aten/src/ATen/ParallelNative.cpp @@ -152,7 +152,7 @@ void invoke_parallel( std::atomic_flag err_flag = ATOMIC_FLAG_INIT; std::exception_ptr eptr; std::mutex mutex; - volatile size_t remaining{0}; + std::atomic_size_t remaining{0}; std::condition_variable cv; } state; diff --git a/aten/src/ATen/ParallelOpenMP.h b/aten/src/ATen/ParallelOpenMP.h index b983571f09a2e..84e744ba10b10 100644 --- a/aten/src/ATen/ParallelOpenMP.h +++ b/aten/src/ATen/ParallelOpenMP.h @@ -11,10 +11,8 @@ #include #endif -namespace at { - #ifdef _OPENMP -namespace internal { +namespace at::internal { template inline void invoke_parallel( int64_t begin, @@ -52,7 +50,5 @@ inline void invoke_parallel( std::rethrow_exception(eptr); } } -} // namespace internal +} // namespace at::internal #endif // _OPENMP - -} // namespace at diff --git a/aten/src/ATen/SparseCsrTensorImpl.cpp b/aten/src/ATen/SparseCsrTensorImpl.cpp index ec9ade9695ece..8dc1fd05452a7 100644 --- a/aten/src/ATen/SparseCsrTensorImpl.cpp +++ b/aten/src/ATen/SparseCsrTensorImpl.cpp @@ -55,7 +55,11 @@ SparseCsrTensorImpl::SparseCsrTensorImpl( "to https://github.com/pytorch/pytorch/issues."); TORCH_INTERNAL_ASSERT(((key_set.has(DispatchKey::SparseCsrCPU) && device().type() == kCPU) - || (key_set.has(DispatchKey::SparseCsrCUDA) && device().type() == kCUDA)), + || (key_set.has(DispatchKey::SparseCsrCUDA) && device().type() == kCUDA) + || (key_set.has(DispatchKey::SparseCsrMeta) && device().type() == kMeta) + || (key_set.has(DispatchKey::SparseCsrCPU) && device().type() == kMeta) // fake tensor + || (key_set.has(DispatchKey::SparseCsrCUDA) && device().type() == kMeta) // fake tensor + || (key_set.has(DispatchKey::SparseCsrPrivateUse1) && device().type() == kPrivateUse1)), "Inconsistent key_set (=", key_set, ") and device (=", device(), ")"); set_storage_access_should_throw(); @@ -166,9 +170,7 @@ void SparseCsrTensorImpl::resize_as_sparse_compressed_tensor_( src.layout(), ")"); - Tensor compressed_indices; - Tensor plain_indices; - std::tie(compressed_indices, plain_indices) = + auto [compressed_indices, plain_indices] = sparse_csr::getCompressedPlainIndices(src); // reuse self indices storage if (crow_indices_.sizes() != compressed_indices.sizes()) { diff --git a/aten/src/ATen/SparseCsrTensorImpl.h b/aten/src/ATen/SparseCsrTensorImpl.h index c39aeb4c5d82b..94ac1e1c39344 100644 --- a/aten/src/ATen/SparseCsrTensorImpl.h +++ b/aten/src/ATen/SparseCsrTensorImpl.h @@ -2,6 +2,7 @@ #include #include +#include #include namespace at { @@ -107,6 +108,39 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { } } + template + c10::intrusive_ptr shallow_copy_and_detach_core( + VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const { + const auto mode_stack_len = c10::impl::TorchDispatchModeTLS::stack_len(); + c10::impl::PyInterpreter&& interpreter = nullptr; + if (mode_stack_len > 0 && + !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) { + const auto& cur_torch_dispatch_mode_state = + c10::impl::TorchDispatchModeTLS::get_stack_at(mode_stack_len - 1); + interpreter = cur_torch_dispatch_mode_state->pyinterpreter(); + } else if ( + key_set_.has(DispatchKey::Python) && + !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) { + interpreter = pyobj_slot_.load_pyobj_interpreter(); + } else { 
+ // otherwise just copy the SparseTensorImpl and not the PyObject. + auto impl = c10::make_intrusive( + key_set(), device(), layout_impl(), dtype()); + copy_tensor_metadata( + /*src_sparse_impl=*/this, + /*dest_sparse_impl=*/impl.get(), + /*version_counter=*/version_counter, + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->refresh_numel(); + return impl; + } + auto r = interpreter->detach(this); + r->set_version_counter(std::forward(version_counter)); + r->set_allow_tensor_metadata_change(allow_tensor_metadata_change); + return r; + } + /** * Return a TensorImpl that is a shallow-copy of this TensorImpl. * @@ -116,15 +150,8 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { c10::intrusive_ptr shallow_copy_and_detach( const c10::VariableVersion& version_counter, bool allow_tensor_metadata_change) const override { - auto impl = c10::make_intrusive( - key_set(), device(), layout_impl(), dtype()); - copy_tensor_metadata( - /*src_sparse_impl=*/this, - /*dest_sparse_impl=*/impl.get(), - /*version_counter=*/version_counter, - /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); - impl->refresh_numel(); - return impl; + return shallow_copy_and_detach_core( + version_counter, allow_tensor_metadata_change); } /** @@ -136,15 +163,8 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { c10::intrusive_ptr shallow_copy_and_detach( c10::VariableVersion&& version_counter, bool allow_tensor_metadata_change) const override { - auto impl = c10::make_intrusive( - key_set(), device(), layout_impl(), dtype()); - copy_tensor_metadata( - /*src_sparse_impl=*/this, - /*dest_sparse_impl=*/impl.get(), - /*version_counter=*/version_counter, - /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); - impl->refresh_numel(); - return impl; + return shallow_copy_and_detach_core( + std::move(version_counter), allow_tensor_metadata_change); } private: @@ -168,12 +188,12 @@ struct TORCH_API SparseCsrTensorImpl : public TensorImpl { static void copy_tensor_metadata( const SparseCsrTensorImpl* src_sparse_impl, SparseCsrTensorImpl* dest_sparse_impl, - const c10::VariableVersion& version_counter, + c10::VariableVersion version_counter, bool allow_tensor_metadata_change) { TensorImpl::copy_tensor_metadata( src_sparse_impl, dest_sparse_impl, - version_counter, + std::move(version_counter), allow_tensor_metadata_change); // Sparse-specific fields diff --git a/aten/src/ATen/SparseCsrTensorUtils.h b/aten/src/ATen/SparseCsrTensorUtils.h index d3071c27b87da..348978293b8ac 100644 --- a/aten/src/ATen/SparseCsrTensorUtils.h +++ b/aten/src/ATen/SparseCsrTensorUtils.h @@ -137,8 +137,7 @@ AT_DISPATCH_CASE_ALL_TYPES_AND_COMPLEX_AND4( \ kComplexHalf, kHalf, kBool, kBFloat16, __VA_ARGS__)) -namespace at { -namespace sparse_csr { +namespace at::sparse_csr { using SparseCsrTensor = Tensor; @@ -244,22 +243,22 @@ inline std::string plainDimName(Layout layout) { } } -inline int rowDimension(Layout layout, IntArrayRef size) { +inline size_t rowDimension(Layout layout, IntArrayRef size) { return size.size() - (isCompressedRow(layout) ? 2 : 1); } -inline int columnDimension(Layout layout, IntArrayRef size) { +inline size_t columnDimension(Layout layout, IntArrayRef size) { return size.size() - (isCompressedColumn(layout) ? 2 : 1); } -inline int compressedDimension( +inline size_t compressedDimension( Layout layout, IntArrayRef size, size_t dense_ndim = 0) { return size.size() - dense_ndim - (isCompressedRow(layout) ? 
2 : 1); } -inline int plainDimension( +inline size_t plainDimension( Layout layout, IntArrayRef size, size_t dense_ndim = 0) { @@ -286,6 +285,21 @@ inline std::pair getCompressedPlainIndices(Tensor const& self) { }); } +inline ScalarType getIndexDtype(Tensor const& self) { + switch (self.layout()) { + case kSparseCsr: + case kSparseBsr: + return self.crow_indices().scalar_type(); + case kSparseCsc: + case kSparseBsc: + return self.ccol_indices().scalar_type(); + case kSparse: + return self._indices().scalar_type(); + default: + return ScalarType::Long; + } +} + inline Layout flip_compressed_layout(Layout layout) { switch (layout) { case kSparseCsr: @@ -335,8 +349,7 @@ inline bool only_sparse_compressed_binary_op_trivial_cases( return true; } if (self.is_same(other)) { - Tensor compressed_indices, plain_indices; - std::tie(compressed_indices, plain_indices) = + auto [compressed_indices, plain_indices] = at::sparse_csr::getCompressedPlainIndices(self); static_cast(out.unsafeGetTensorImpl()) ->set_member_tensors( @@ -367,13 +380,12 @@ inline bool only_sparse_compressed_add_trivial_cases( }); } -inline Tensor to_type(Tensor input, ScalarType dtype) { - Tensor compressed_indices, plain_indices; - std::tie(compressed_indices, plain_indices) = +inline Tensor to_type(const Tensor& input, ScalarType dtype) { + auto [compressed_indices, plain_indices] = at::sparse_csr::getCompressedPlainIndices(input); return at::_sparse_compressed_tensor_unsafe( - std::move(compressed_indices), - std::move(plain_indices), + compressed_indices, + plain_indices, std::move(input.values()).to(dtype), input.sizes(), dtype, @@ -388,7 +400,7 @@ inline std::tuple create_acc_buffer( ScalarType type, int64_t nnz = -1) { Tensor new_values, new_values_acc; - constexpr bool need_acc = !std::is_same::value; + constexpr bool need_acc = !std::is_same_v; bool is_integral = at::isIntegralType(type, /*includeBool=*/true); if constexpr (need_acc) { auto acc_dtype = CppTypeToScalarType::value; @@ -411,5 +423,4 @@ inline void copy_from_acc_buffer(Tensor& new_values, Tensor& new_values_acc) { } } -} // namespace sparse_csr -} // namespace at +} // namespace at::sparse_csr diff --git a/aten/src/ATen/SparseTensorImpl.cpp b/aten/src/ATen/SparseTensorImpl.cpp index 36c93b706db86..0c0286f6c7a8c 100644 --- a/aten/src/ATen/SparseTensorImpl.cpp +++ b/aten/src/ATen/SparseTensorImpl.cpp @@ -35,7 +35,6 @@ SparseTensorImpl::SparseTensorImpl(at::DispatchKeySet key_set, const caffe2::Typ SparseTensorImpl::SparseTensorImpl(at::DispatchKeySet key_set, const caffe2::TypeMeta data_type, at::Tensor indices, at::Tensor values) : TensorImpl(key_set, data_type, values.device()) , sparse_dim_(1) - , dense_dim_(0) , indices_(std::move(indices)) , values_(std::move(values)) { // we proxy to this constructor so we can initialize the device correctly, but really only indices/values of this shape are allowed. 
diff --git a/aten/src/ATen/SparseTensorImpl.h b/aten/src/ATen/SparseTensorImpl.h index d90734100ca6c..af9cbd28b1c35 100644 --- a/aten/src/ATen/SparseTensorImpl.h +++ b/aten/src/ATen/SparseTensorImpl.h @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -306,6 +307,38 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { const Tensor& indices, const Tensor& values); + template + c10::intrusive_ptr shallow_copy_and_detach_core( + VariableVersion&& version_counter, + bool allow_tensor_metadata_change) const { + const auto mode_stack_len = c10::impl::TorchDispatchModeTLS::stack_len(); + c10::impl::PyInterpreter&& interpreter = nullptr; + if (mode_stack_len > 0 && + !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) { + const auto& cur_torch_dispatch_mode_state = + c10::impl::TorchDispatchModeTLS::get_stack_at(mode_stack_len - 1); + interpreter = cur_torch_dispatch_mode_state->pyinterpreter(); + } else if ( + key_set_.has(DispatchKey::Python) && + !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::Python)) { + interpreter = pyobj_slot_.load_pyobj_interpreter(); + } else { + // otherwise just copy the SparseTensorImpl and not the PyObject. + auto impl = c10::make_intrusive(key_set(), dtype()); + copy_tensor_metadata( + /*src_sparse_impl=*/this, + /*dest_sparse_impl=*/impl.get(), + /*version_counter=*/version_counter, + /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); + impl->refresh_numel(); + return impl; + } + auto r = interpreter->detach(this); + r->set_version_counter(std::forward(version_counter)); + r->set_allow_tensor_metadata_change(allow_tensor_metadata_change); + return r; + } + /** * Return a TensorImpl that is a shallow-copy of this TensorImpl. * @@ -315,14 +348,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { c10::intrusive_ptr shallow_copy_and_detach( const c10::VariableVersion& version_counter, bool allow_tensor_metadata_change) const override { - auto impl = c10::make_intrusive(key_set(), dtype()); - copy_tensor_metadata( - /*src_impl=*/this, - /*dest_impl=*/impl.get(), - /*version_counter=*/version_counter, - /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); - impl->refresh_numel(); - return impl; + return shallow_copy_and_detach_core( + version_counter, allow_tensor_metadata_change); } /** @@ -334,14 +361,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { c10::intrusive_ptr shallow_copy_and_detach( c10::VariableVersion&& version_counter, bool allow_tensor_metadata_change) const override { - auto impl = c10::make_intrusive(key_set(), dtype()); - copy_tensor_metadata( - /*src_impl=*/this, - /*dest_impl=*/impl.get(), - /*version_counter=*/std::move(version_counter), - /*allow_tensor_metadata_change=*/allow_tensor_metadata_change); - impl->refresh_numel(); - return impl; + return shallow_copy_and_detach_core( + std::move(version_counter), allow_tensor_metadata_change); } /** @@ -354,8 +375,8 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { AT_ASSERT(has_compatible_shallow_copy_type(impl->key_set())); auto sparse_impl = static_cast(impl.get()); copy_tensor_metadata( - /*src_impl=*/sparse_impl, - /*dest_impl=*/this, + /*src_sparse_impl=*/sparse_impl, + /*dest_sparse_impl=*/this, /*version_counter=*/version_counter(), /*allow_tensor_metadata_change=*/allow_tensor_metadata_change()); refresh_numel(); @@ -378,12 +399,12 @@ struct TORCH_API SparseTensorImpl : public TensorImpl { static void copy_tensor_metadata( const SparseTensorImpl* src_sparse_impl, SparseTensorImpl* dest_sparse_impl, - const 
c10::VariableVersion& version_counter, + c10::VariableVersion version_counter, bool allow_tensor_metadata_change) { TensorImpl::copy_tensor_metadata( src_sparse_impl, dest_sparse_impl, - version_counter, + std::move(version_counter), allow_tensor_metadata_change); // Sparse-specific fields diff --git a/aten/src/ATen/StorageUtils.cpp b/aten/src/ATen/StorageUtils.cpp index df84464fc687e..19c240ed89048 100644 --- a/aten/src/ATen/StorageUtils.cpp +++ b/aten/src/ATen/StorageUtils.cpp @@ -25,10 +25,10 @@ C10_EXPORT void storage_copy( const c10::Storage& src, bool non_blocking) { auto dst_options = c10::TensorOptions().device(dst.device()).dtype(at::kByte); - auto dst_t = at::empty({0}, {}, dst_options).set_(dst); + auto dst_t = at::empty({0}, dst_options).set_(dst); auto src_options = c10::TensorOptions().device(src.device()).dtype(at::kByte); - auto src_t = at::empty({0}, {}, src_options).set_(src); + auto src_t = at::empty({0}, src_options).set_(src); dst_t.copy_(src_t, non_blocking); } diff --git a/aten/src/ATen/TensorGeometry.h b/aten/src/ATen/TensorGeometry.h index b64393e64a770..41f14a15ba99c 100644 --- a/aten/src/ATen/TensorGeometry.h +++ b/aten/src/ATen/TensorGeometry.h @@ -20,7 +20,7 @@ struct TORCH_API TensorGeometry { strides_(sizes.size()), has_symbolic_sizes_strides_( !c10::asIntArrayRefSlowOpt(sizes).has_value()) { - int64_t dim = sizes.size(); + int64_t dim = static_cast(sizes.size()); c10::SymInt expected_stride = 1; for (int64_t i = dim - 1; i >= 0; i--) { strides_[i] = expected_stride; @@ -41,7 +41,7 @@ struct TORCH_API TensorGeometry { bool is_contiguous() const; int64_t dim() const { - return sizes_.size(); + return static_cast(sizes_.size()); } int64_t size(int64_t dim) const { diff --git a/aten/src/ATen/TensorIndexing.h b/aten/src/ATen/TensorIndexing.h index b6e14addb4a7e..eb29b4d5ad739 100644 --- a/aten/src/ATen/TensorIndexing.h +++ b/aten/src/ATen/TensorIndexing.h @@ -24,8 +24,8 @@ namespace at::indexing { -const int64_t INDEX_MIN = c10::SymInt::min_representable_int(); -const int64_t INDEX_MAX = -(INDEX_MIN + 1); +constexpr int64_t INDEX_MIN = c10::SymInt::min_representable_int(); +constexpr int64_t INDEX_MAX = -(INDEX_MIN + 1); enum class TensorIndexType { None, Ellipsis, SymInt, Boolean, Slice, Tensor }; @@ -130,9 +130,7 @@ struct TORCH_API TensorIndex final { TensorIndex(int integer) : TensorIndex(SymInt(integer)) {} // Case 4: Boolean value - template < - class T, - class = typename std::enable_if::value>::type> + template >> TensorIndex(T boolean) : boolean_(boolean), type_(TensorIndexType::Boolean) {} // Case 5: Slice represented in `at::indexing::Slice` form @@ -219,7 +217,8 @@ static inline Tensor applySlice( SymInt length = (self_device == at::kCPU || self_device == at::kCUDA) ? (*self_sizes)[dim] : self.sym_size(dim); - if (!disable_slice_optimization && start == 0 && length == stop && + if (!disable_slice_optimization && + TORCH_GUARD_SIZE_OBLIVIOUS(start.sym_eq(0)) && length == stop && step == 1) { return self; } @@ -273,9 +272,9 @@ static inline Tensor boolToIndexingTensorCPUOrCUDA( // booleans add a dimension of size 1. true indexes this dimension as if 0:, // false as empty. 
if (value) { - return at::empty({1}, {}, self.options().dtype(kLong)).fill_(0.); + return at::empty({1}, self.options().dtype(kLong)).fill_(0.); } else { - return at::empty({0}, {}, self.options().dtype(kLong)); + return at::empty({0}, self.options().dtype(kLong)); } } @@ -285,9 +284,9 @@ static inline Tensor boolToIndexingTensorNonNativeDeviceType( // booleans add a dimension of size 1. true indexes this dimension as if 0:, // false as empty. if (value) { - return at::zeros({1}, {}, self.options().dtype(kLong)); + return at::zeros({1}, self.options().dtype(kLong)); } else { - return at::empty({0}, {}, self.options().dtype(kLong)); + return at::empty({0}, self.options().dtype(kLong)); } } @@ -318,12 +317,12 @@ static inline void recordTensorIndex( (*dim_ptr)++; }; -static inline c10::List> typeConvertIndices( +static inline c10::List<::std::optional> typeConvertIndices( const Tensor& /*self*/, std::vector&& indices) { - c10::List> converted_inds; + c10::List<::std::optional> converted_inds; converted_inds.reserve(indices.size()); - for (const auto& i : indices) { + for (auto&& i : std::move(indices)) { converted_inds.push_back(std::move(i)); } return converted_inds; @@ -539,9 +538,9 @@ static inline Tensor applySlicing( /*prev_dim_result=*/result, /*original_tensor=*/self, /*index=*/obj, - /*dim=*/&dim, + /*dim_ptr=*/&dim, /*specified_dims_ptr=*/&specified_dims, - /*real_dim=*/i, + /*real_dim=*/static_cast(i), /*outIndices=*/outIndices, /*disable_slice_optimization=*/disable_slice_optimization, /*original_tensor_device=*/self_device, diff --git a/aten/src/ATen/TensorIterator.cpp b/aten/src/ATen/TensorIterator.cpp index 99c8eda122cfc..0afac10d44fbf 100644 --- a/aten/src/ATen/TensorIterator.cpp +++ b/aten/src/ATen/TensorIterator.cpp @@ -52,7 +52,7 @@ inline void get_strides(int64_t* strides, ArrayRef operands, int64_ } // Always at least 2d strides to support 2d for_each loops if (ndim < 2) { - const int64_t ntensors = operands.size(); + auto ntensors = operands.size(); std::fill_n(strides, (2 - ndim) * ntensors, 0); } } @@ -92,7 +92,7 @@ void OperandInfo::tensor(c10::MaybeOwned &&tensor) { void OperandInfo::exchange_tensor(c10::MaybeOwned &&new_tensor) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!original_tensor_base_->defined()); - original_tensor_base_ = std::exchange(tensor_base_, new_tensor); + original_tensor_base_ = std::exchange(tensor_base_, std::move(new_tensor)); *original_tensor_storage_ = std::exchange(*tensor_storage_, make_otr(*tensor_base_)); } @@ -119,6 +119,13 @@ TensorIteratorConfig& TensorIteratorConfig::add_owned_input(const TensorBase& in return *this; } +TensorIteratorConfig& TensorIteratorConfig::add_owned_const_input(const TensorBase& input) { + const_tensor_indices_.push_back(tensors_.size()); + tensors_.push_back(c10::MaybeOwned::owned(std::in_place, input)); + num_inputs_++; + return *this; +} + TensorIteratorConfig& TensorIteratorConfig::add_borrowed_output(const TensorBase& output) { TORCH_INTERNAL_ASSERT( num_inputs_ == 0, @@ -135,6 +142,13 @@ TensorIteratorConfig& TensorIteratorConfig::add_borrowed_input(const TensorBase& return *this; } +TensorIteratorConfig& TensorIteratorConfig::add_borrowed_const_input(const TensorBase& input) { + const_tensor_indices_.push_back(tensors_.size()); + tensors_.push_back(c10::MaybeOwned::borrowed(input)); + num_inputs_++; + return *this; +} + TensorIteratorConfig& TensorIteratorConfig::declare_static_dtype_and_device(ScalarType dtype, Device device) { TORCH_CHECK(!check_all_same_dtype_, "check_all_same_dtype(false) must be called 
before declare_static_dtype(...)"); static_dtype_ = dtype; @@ -173,6 +187,10 @@ TensorIteratorConfig& TensorIteratorConfig::declare_static_shape(IntArrayRef sha return *this; } +bool TensorIteratorConfig::is_tensor_const(size_t idx) { + return std::find(const_tensor_indices_.begin(), const_tensor_indices_.end(), idx) != const_tensor_indices_.end(); +} + // NOTE: [Computing output strides] // We use the following algorithm to compute output strides // If correctly sized output is provided, we respect its strides and don't change them @@ -531,7 +549,7 @@ void TensorIteratorBase::compute_types(const TensorIteratorConfig& config) { } } -StrideVector TensorIteratorBase::compatible_stride(int element_size) const { +StrideVector TensorIteratorBase::compatible_stride(int64_t element_size) const { auto stride = StrideVector(); int64_t next_stride = element_size; for (const auto dim : c10::irange(ndim())) { @@ -558,8 +576,8 @@ void TensorIteratorBase::allocate_or_resize_outputs() { auto& op = operands_[i]; if (!op.tensor_base().defined() || op.will_resize) { TORCH_INTERNAL_ASSERT(op.is_type_defined(), "no type for operand", i); - int element_size = elementSize(op.target_dtype); - op.stride_bytes = compatible_stride(element_size); + auto element_size = elementSize(op.target_dtype); + op.stride_bytes = compatible_stride(static_cast(element_size)); // check if permutation is just an inverted order bool inverted = true; for (const auto j : c10::irange(ndim())) { @@ -577,7 +595,7 @@ void TensorIteratorBase::allocate_or_resize_outputs() { } else { auto tensor_stride = invert_perm(op.stride_bytes); for (const auto dim : c10::irange(ndim())) { - tensor_stride[dim] /= element_size; + tensor_stride[dim] /= static_cast(element_size); } set_output_raw_strided(i, tensor_shape, tensor_stride, original_options(op), names_); } @@ -757,7 +775,7 @@ void TensorIteratorBase::for_each(loop2d_t loop, int64_t grain_size) { StrideVector TensorIteratorBase::get_strides() const { const auto dim = ndim(); - StrideVector strides(std::max(dim, 2) * ntensors()); + StrideVector strides(static_cast(std::max(dim, 2)) * ntensors()); at::get_strides(strides.data(), operands_, dim); return strides; } @@ -771,7 +789,7 @@ void TensorIteratorBase::serial_for_each(loop2d_t loop, Range range) const { const auto ndim = this->ndim(); c10::SmallBuffer ptrs(ntensors); - c10::SmallBuffer strides(ntensors * std::max(ndim, 2)); + c10::SmallBuffer strides(ntensors * static_cast(std::max(ndim, 2))); at::get_base_ptrs(ptrs.data(), operands_); at::get_strides(strides.data(), operands_, ndim); @@ -795,7 +813,7 @@ bool TensorIteratorBase::is_contiguous() const { } -bool TensorIteratorBase::is_scalar(int arg) const { +bool TensorIteratorBase::is_scalar(int64_t arg) const { const auto& stride = operands_[arg].stride_bytes; for (const auto i : c10::irange(ndim())) { if (stride[i] != 0 && shape_[i] != 1) { @@ -805,7 +823,7 @@ bool TensorIteratorBase::is_scalar(int arg) const { return true; } -bool TensorIteratorBase::is_cpu_scalar(int arg) const { +bool TensorIteratorBase::is_cpu_scalar(int64_t arg) const { return is_scalar(arg) && device(arg).is_cpu(); } @@ -817,7 +835,7 @@ void TensorIteratorBase::cast_outputs() { // and tensor, this condition should no longer ever be true const auto &original_tensor = op.original_tensor(); const auto &tensor = op.tensor(); - if (original_tensor.sizes() != tensor.sizes()){ + if (original_tensor.sizes() != tensor.sizes()) { original_tensor.resize_as_(tensor).as_strided_(tensor.sizes(), tensor.strides()); } 
original_tensor.copy_(tensor); @@ -826,15 +844,15 @@ void TensorIteratorBase::cast_outputs() { } } -void* TensorIteratorBase::data_ptr(int arg) const { +void* TensorIteratorBase::data_ptr(int64_t arg) const { return operands_[arg].data; } -void TensorIteratorBase::remove_operand(int arg) { +void TensorIteratorBase::remove_operand(int64_t arg) { operands_.erase(operands_.begin() + arg); } -void TensorIteratorBase::unsafe_replace_operand(int arg, void* data) { +void TensorIteratorBase::unsafe_replace_operand(int64_t arg, void* data) { operands_[arg].data = data; } @@ -874,16 +892,16 @@ void TensorIteratorBase::build_binary_float_op( const TensorBase& out, const TensorBase& a, const TensorBase& b) { build(BINARY_FLOAT_OP_CONFIG() .add_owned_output(out) - .add_owned_input(a) - .add_owned_input(b)); + .add_owned_const_input(a) + .add_owned_const_input(b)); } void TensorIteratorBase::build_borrowing_binary_float_op( const TensorBase& out, const TensorBase& a, const TensorBase& b) { build(BINARY_FLOAT_OP_CONFIG() .add_output(out) - .add_input(a) - .add_input(b)); + .add_const_input(a) + .add_const_input(b)); } static void set_up_comparison_op_config(TensorIteratorConfig& config, const TensorBase& out) { @@ -916,8 +934,8 @@ void TensorIteratorBase::build_comparison_op( set_up_comparison_op_config(config, out); config.add_owned_output(out); - config.add_owned_input(a); - config.add_owned_input(b); + config.add_owned_const_input(a); + config.add_owned_const_input(b); build(config); } @@ -927,8 +945,8 @@ void TensorIteratorBase::build_borrowing_comparison_op( set_up_comparison_op_config(config, out); config.add_borrowed_output(out); - config.add_borrowed_input(a); - config.add_borrowed_input(b); + config.add_borrowed_const_input(a); + config.add_borrowed_const_input(b); build(config); } @@ -938,8 +956,8 @@ void TensorIteratorBase::build_borrowing_except_last_argument_comparison_op( set_up_comparison_op_config(config, out); config.add_borrowed_output(out); - config.add_borrowed_input(a); - config.add_owned_input(b); + config.add_borrowed_const_input(a); + config.add_owned_const_input(b); build(config); } @@ -951,9 +969,9 @@ void TensorIteratorBase::build_ternary_op( .cast_common_dtype_to_outputs(true) .enforce_safe_casting_to_output(true) .add_owned_output(out) - .add_owned_input(a) - .add_owned_input(b) - .add_owned_input(c)); + .add_owned_const_input(a) + .add_owned_const_input(b) + .add_owned_const_input(c)); } // This cannot be a function because TensorIteratorConfig is not @@ -969,16 +987,16 @@ void TensorIteratorBase::build_ternary_op( void TensorIteratorBase::build_binary_op(const TensorBase& out, const TensorBase& a, const TensorBase& b) { build(BINARY_OP_CONFIG() .add_owned_output(out) - .add_owned_input(a) - .add_owned_input(b)); + .add_owned_const_input(a) + .add_owned_const_input(b)); } void TensorIteratorBase::build_borrowing_binary_op( const TensorBase& out, const TensorBase& a, const TensorBase& b) { build(BINARY_OP_CONFIG() .add_output(out) - .add_input(a) - .add_input(b)); + .add_const_input(a) + .add_const_input(b)); } // This cannot be a function because TensorIteratorConfig is not @@ -994,13 +1012,13 @@ void TensorIteratorBase::build_borrowing_binary_op( void TensorIteratorBase::build_unary_float_op(const TensorBase& out, const TensorBase& a) { build(UNARY_FLOAT_OP_CONFIG() .add_owned_output(out) - .add_owned_input(a)); + .add_owned_const_input(a)); } void TensorIteratorBase::build_borrowing_unary_float_op(const TensorBase& out, const TensorBase& a) { build(UNARY_FLOAT_OP_CONFIG() 
.add_output(out) - .add_input(a)); + .add_const_input(a)); } // This cannot be a function because TensorIteratorConfig is not @@ -1015,19 +1033,19 @@ void TensorIteratorBase::build_borrowing_unary_float_op(const TensorBase& out, c void TensorIteratorBase::build_unary_op(const TensorBase& out, const TensorBase& a) { build(UNARY_OP_CONFIG() .add_owned_output(out) - .add_owned_input(a)); + .add_owned_const_input(a)); } void TensorIteratorBase::build_borrowing_unary_op(const TensorBase& out, const TensorBase& a) { build(UNARY_OP_CONFIG() .add_output(out) - .add_input(a)); + .add_const_input(a)); } void TensorIteratorBase::build_output_borrowing_argument_owning_unary_op(const TensorBase& out, const TensorBase& a) { build(UNARY_OP_CONFIG() .add_output(out) - .add_owned_input(a)); + .add_owned_const_input(a)); } // Helper to construct a unary op that forcibly promotes output to boolean. @@ -1039,7 +1057,7 @@ void TensorIteratorBase::build_borrowing_unary_force_boolean_op(const TensorBase .declare_static_dtype(at::kBool) .declare_static_device(a.device()) .add_output(out) - .add_input(a)); + .add_const_input(a)); } TensorIterator TensorIterator::binary_op(TensorBase& out, const TensorBase& a, const TensorBase& b) { @@ -1104,7 +1122,7 @@ TensorIterator TensorIterator::reduce_op(TensorBase& out, const TensorBase& a) { return TensorIteratorConfig() .set_check_mem_overlap(false) .add_owned_output(out) - .add_owned_input(a) + .add_owned_const_input(a) .resize_outputs(false) .is_reduction(true) // TODO: not supporting casting to outputs is only really necessary for arg{min,max} @@ -1128,7 +1146,7 @@ TensorIterator TensorIterator::reduce_op(TensorBase& out1, TensorBase& out2, con .set_check_mem_overlap(false) .add_owned_output(out1) .add_owned_output(out2) - .add_owned_input(a) + .add_owned_const_input(a) .resize_outputs(false) .is_reduction(true) .check_all_same_dtype(false) @@ -1136,7 +1154,8 @@ TensorIterator TensorIterator::reduce_op(TensorBase& out1, TensorBase& out2, con } void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) { - for (auto& tensor: config.tensors_) { + for (const auto idx : c10::irange(config.tensors_.size())) { + auto& tensor = config.tensors_[idx]; // If *any* of the arguments is a meta tensor, the overall // computation is a meta computation (don't do any work, // just compute output information). This aligns with @@ -1145,6 +1164,7 @@ void TensorIteratorBase::populate_operands(TensorIteratorConfig& config) { is_meta_ = true; } operands_.emplace_back(std::move(tensor)); + operands_[idx].is_const = config.is_tensor_const(idx); } num_outputs_ = config.num_outputs_; } @@ -1176,6 +1196,9 @@ void TensorIteratorBase::mark_resize_outputs(const TensorIteratorConfig& config) } for (const auto i : c10::irange(num_outputs_)) { const auto& output = tensor(i); + if (!output.defined()) { + operands_[i].will_resize = true; + } if (output.defined() && !output.sizes().equals(shape_)) { if (config.resize_outputs_ && !operands_[i].is_read_write) { operands_[i].will_resize = true; @@ -1507,18 +1530,23 @@ void TensorIteratorBase::build(TensorIteratorConfig& config) { // XLA and lazy tensors don't have storage, so they don't have an underlying data pointer. // Nothing beyond this point is important for meta functions, so it's fine to exit early here. - // Extend the condition to ORT tesnors as ORT tensors also don't have storage. + // Extend the condition to MAIA tensors as MAIA tensors also don't have storage.
if (privateuse1_without_storage || common_device_.type() == DeviceType::MTIA || common_device_.type() == DeviceType::XLA || common_device_.type() == DeviceType::IPU || common_device_.type() == DeviceType::Lazy || - common_device_.type() == DeviceType::ORT || + common_device_.type() == DeviceType::MAIA || common_device_.type() == DeviceType::HPU) return; for (auto& op : operands_) { TORCH_INTERNAL_ASSERT(op.tensor_base().defined()); - op.data = op.tensor_base().data_ptr(); + if (op.is_const) { + // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) + op.data = const_cast(op.tensor_base().const_data_ptr()); + } else { + op.data = op.tensor_base().mutable_data_ptr(); + } } // zero out offsets @@ -1654,7 +1682,7 @@ SplitUntil32Bit::iterator& SplitUntil32Bit::iterator::operator++() { vec.pop_back(); while (!vec.empty() && !vec.back()->can_use_32bit_indexing()) { auto& iter = *vec.back(); - int64_t split_dim = iter.get_dim_to_split(); + auto split_dim = iter.get_dim_to_split(); vec.emplace_back(iter.split(split_dim)); } return *this; @@ -1683,7 +1711,7 @@ DimCounter::DimCounter(IntArrayRef shape, Range range) } int64_t linear_offset = range.begin; - int64_t ndim = values.size(); + auto ndim = values.size(); for (const auto dim : c10::irange(ndim)) { int64_t size = shape[dim]; if (size > 0) { @@ -1700,9 +1728,9 @@ bool DimCounter::is_done() const { void DimCounter::increment(const std::array& step) { offset += step[0] * step[1]; - int64_t ndim = values.size(); + auto ndim = values.size(); int64_t overflow = step[0]; - int i = 0; + size_t i = 0; if (step[1] != 1) { TORCH_INTERNAL_ASSERT(step[0] == shape[0] && values[0] == 0); i = 1; @@ -1719,7 +1747,7 @@ void DimCounter::increment(const std::array& step) { } else { overflow = 0; } - values[i] = value; + values[i] = static_cast(value); } TORCH_INTERNAL_ASSERT(overflow == 0 || overflow == 1); } diff --git a/aten/src/ATen/TensorIterator.h b/aten/src/ATen/TensorIterator.h index f34ffad3f3b43..a241244a5744c 100644 --- a/aten/src/ATen/TensorIterator.h +++ b/aten/src/ATen/TensorIterator.h @@ -79,7 +79,7 @@ constexpr int64_t GRAIN_SIZE = 32768; // Storage for a non-owning Tensor, without needing to include Tensor.h class TORCH_API OpaqueOptionalTensorRef { - alignas(alignof(TensorBase)) std::array data_; + alignas(alignof(TensorBase)) std::array data_{}; public: OpaqueOptionalTensorRef(); @@ -167,10 +167,23 @@ struct TORCH_API OperandInfo { bool is_output = false; + // will_resize applies only to output tensors. + // 1) Functional call (like torch.add(self, other)): the output tensor is + // undefined, and PyTorch creates a new tensor using the common shape and + // computed stride in TensorIterator; + // 2) Inplace call (like torch.add_(self, other)): the output tensor is the + // same as the input tensor, so its size and stride cannot be modified; + // 3) Op call with an output (like torch.add(self, other, out=output)): the + // output tensor is defined, but its shape may differ from the common shape. + // If the shapes differ, this output tensor will be resized using the common + // shape and computed stride in TensorIterator; otherwise its size and + // stride cannot be modified.
bool will_resize = false; bool is_read_write = false; + bool is_const = false; + void validate() { TORCH_CHECK( !tensor_base_->defined() || tensor_base_->layout() == kStrided, @@ -291,11 +304,11 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { bool is_dim_reduced(int dim) const; /// Accessors for each operand - IntArrayRef strides(int arg) const { + IntArrayRef strides(int64_t arg) const { return operands_[arg].stride_bytes; } - void* data_ptr(int arg) const; - ScalarType dtype(int arg = 0) const { + void* data_ptr(int64_t arg) const; + ScalarType dtype(int64_t arg = 0) const { return operands_[arg].current_dtype; } ScalarType common_dtype() const { @@ -304,43 +317,43 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { "Queried for invalid common dtype!"); return common_dtype_; } - ScalarType input_dtype(int arg = 0) const { + ScalarType input_dtype(int64_t arg = 0) const { return operands_[num_outputs_ + arg].current_dtype; } - Device device(int arg = 0) const { + Device device(int64_t arg = 0) const { return operands_[arg].device.value(); } - c10::DeviceType device_type(int arg = 0) const { + c10::DeviceType device_type(int64_t arg = 0) const { return device(arg).type(); } - int64_t element_size(int arg) const { + int64_t element_size(int64_t arg) const { return static_cast(elementSize(dtype(arg))); } - bool is_scalar(int arg) const; - bool is_cpu_scalar(int arg) const; + bool is_scalar(int64_t arg) const; + bool is_cpu_scalar(int64_t arg) const; - const TensorBase& tensor_base(int arg) const { + const TensorBase& tensor_base(int64_t arg) const { return operands_[arg].tensor_base(); } - const Tensor& tensor(int arg) const { + const Tensor& tensor(int64_t arg) const { return operands_[arg].tensor(); } - const TensorBase& output_base(int arg = 0) const { + const TensorBase& output_base(int64_t arg = 0) const { AT_ASSERT(arg < num_outputs_); return tensor_base(arg); } - const Tensor& output(int arg = 0) const { + const Tensor& output(int64_t arg = 0) const { AT_ASSERT(arg < num_outputs_); return tensor(arg); } - const TensorBase& input_base(int arg = 0) const { + const TensorBase& input_base(int64_t arg = 0) const { AT_ASSERT(arg >= 0 && arg < ntensors() - num_outputs_); return tensor_base(num_outputs_ + arg); } - const Tensor& input(int arg = 0) const { + const Tensor& input(int64_t arg = 0) const { AT_ASSERT(arg >= 0 && arg < ntensors() - num_outputs_); return tensor(num_outputs_ + arg); } @@ -350,7 +363,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { void cast_outputs(); /// Removes an operand from this iterator - void remove_operand(int arg); + void remove_operand(int64_t arg); /// Shrinks an iterated dimension void narrow(int dim, int64_t start, int64_t size); /// Narrows every dim after and including `start_dim` to size one. @@ -358,7 +371,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { /// Replaces the data pointer for the operand at index `arg`. /// The new pointer should have the same sizes, strides and dtype as the /// original - void unsafe_replace_operand(int arg, void* data); + void unsafe_replace_operand(int64_t arg, void* data); /// Splits this TensorIterator into two iterators. Together they iterate over /// the entire operation. Used by `with_32bit_indexing()`. 
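The TensorIterator hunks above thread const-awareness through the iterator: TensorIteratorConfig gains add_const_input / add_owned_const_input / add_borrowed_const_input, OperandInfo records is_const, and build() fetches const_data_ptr() for const operands. A hedged sketch of how a caller might use the new config methods (illustration only, not part of the patch; add_like is a hypothetical helper and the actual loop-kernel dispatch is elided):

#include <ATen/ATen.h>
#include <ATen/TensorIterator.h>

// Builds an element-wise iterator over two read-only inputs and one
// pre-allocated output; assumes `a` and `b` share dtype and device.
at::Tensor add_like(const at::Tensor& a, const at::Tensor& b) {
  at::Tensor out = at::empty_like(a);
  auto iter = at::TensorIteratorConfig()
                  .add_output(out)     // borrowed output, added before inputs
                  .add_const_input(a)  // read-only: build() uses const_data_ptr()
                  .add_const_input(b)
                  .build();
  // ... run a loop kernel (e.g. cpu_kernel) over `iter` here ...
  return out;
}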
@@ -368,7 +381,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { int get_dim_to_split() const; template - T scalar_value(int arg) { + T scalar_value(int64_t arg) { auto& op = operands_[arg]; return c10::fetch_and_cast(op.tensor_base().scalar_type(), op.data); } @@ -378,13 +391,14 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { /// If the scalar is aleady given in the type of Half, then return scalar /// value from tensor_base. template - T original_scalar_value(int arg) { + T original_scalar_value(int64_t arg) { auto& original_tensor_base = operands_[arg].original_tensor_base(); if (original_tensor_base.defined()) { TORCH_INTERNAL_ASSERT( original_tensor_base.scalar_type() != common_dtype()); return c10::fetch_and_cast( - original_tensor_base.scalar_type(), original_tensor_base.data_ptr()); + original_tensor_base.scalar_type(), + original_tensor_base.const_data_ptr()); } else { return scalar_value(arg); } @@ -413,10 +427,10 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { template < typename loop1d_t, std::enable_if_t< - std::is_convertible< + std::is_convertible_v< loop1d_t, c10::function_ref< - void(char**, const int64_t* strides, int64_t size)>>::value, + void(char**, const int64_t* strides, int64_t size)>>, int> = 0> void for_each(loop1d_t loop, int64_t grain_size = at::internal::GRAIN_SIZE) { for_each(loop_2d_from_1d(loop), grain_size); @@ -429,10 +443,10 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { template < typename loop1d_t, std::enable_if_t< - std::is_convertible< + std::is_convertible_v< loop1d_t, c10::function_ref< - void(char**, const int64_t* strides, int64_t size)>>::value, + void(char**, const int64_t* strides, int64_t size)>>, int> = 0> void serial_for_each(loop1d_t loop, Range range) { serial_for_each(loop_2d_from_1d(loop), range); @@ -443,7 +457,7 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { /// Create a strides array for a Tensor with shape of this iterator. The /// parameter `element_size` specifies the size of Tensor's data type in /// bytes (e.g. `4` for `float`) - StrideVector compatible_stride(int element_size) const; + StrideVector compatible_stride(int64_t element_size) const; /// Inverts the re-ordering done by reorder_dimensions. This can only be /// called *before* coalesce_dimensions() is called. @@ -462,13 +476,28 @@ struct TORCH_API TensorIteratorBase : public impl::MetaBase { PtrVector get_base_ptrs() const; // Helper functions for advanced stride manipulations (e.g. torch.flip) - void _unsafe_set_arg_strides(const int arg, IntArrayRef strides) { + void _unsafe_set_arg_strides(const int64_t arg, IntArrayRef strides) { operands_[arg].stride_bytes = strides; } - void _unsafe_set_arg_data(const int arg, void* data) { + void _unsafe_set_arg_data(const int64_t arg, void* data) { operands_[arg].data = data; } + // Helper functions for custom device, custom device can get OperandInfo and + // NameVector in their side. + const OperandInfo& operand(int arg = 0) const { + return operands_[arg]; + } + OperandInfo& operand(int arg = 0) { + return operands_[arg]; + } + NameVector& get_dim_names() { + return names_; + } + const NameVector& get_dim_names() const { + return names_; + } + /// true if the stride computation can use 32-bit arithmetic. 
Used by GPU /// kernels bool can_use_32bit_indexing() const; @@ -769,10 +798,14 @@ class TORCH_API TensorIteratorConfig final { TensorIteratorConfig& add_input(const TensorBase& input) { return add_borrowed_input(input); } + TensorIteratorConfig& add_const_input(const TensorBase& input) { + return add_borrowed_const_input(input); + } // Borrowing from temporaries is unlikely to go well. TensorIteratorConfig& add_output(TensorBase&& output) = delete; TensorIteratorConfig& add_input(TensorBase&& input) = delete; + TensorIteratorConfig& add_const_input(TensorBase&& input) = delete; // Stores input/output Tensors while incrementing the reference count. // Note that add_{in,out}put are nearly always what you @@ -780,6 +813,7 @@ class TORCH_API TensorIteratorConfig final { // compile. TensorIteratorConfig& add_owned_output(const TensorBase& output); TensorIteratorConfig& add_owned_input(const TensorBase& input); + TensorIteratorConfig& add_owned_const_input(const TensorBase& input); // Advanced API: stores input/output Tensors without incrementing // the reference count. The caller must ensure that these Tensors @@ -788,10 +822,12 @@ class TORCH_API TensorIteratorConfig final { // Important: the outputs have to be added before the inputs. TensorIteratorConfig& add_borrowed_output(const TensorBase& output); TensorIteratorConfig& add_borrowed_input(const TensorBase& input); + TensorIteratorConfig& add_borrowed_const_input(const TensorBase& input); // Borrowing from temporaries is unlikely to go well. TensorIteratorConfig& add_borrowed_output(TensorBase&& output) = delete; TensorIteratorConfig& add_borrowed_input(TensorBase&& input) = delete; + TensorIteratorConfig& add_borrowed_const_input(TensorBase&& input) = delete; // Sets the check_mem_overlap_ flag, which is true by default. 
// If true, inputs are checked for partial overlap with the outputs and @@ -929,6 +965,8 @@ class TORCH_API TensorIteratorConfig final { } private: + bool is_tensor_const(size_t idx); + SmallVector, 4> tensors_; int num_outputs_ = 0; int num_inputs_ = 0; @@ -947,6 +985,8 @@ class TORCH_API TensorIteratorConfig final { bool promote_inputs_to_common_dtype_ = false; bool promote_integer_inputs_to_float_ = false; bool cast_common_dtype_to_outputs_ = false; + + SmallVector const_tensor_indices_; }; /// A container-like struct that acts as if it contains splits of a @@ -981,6 +1021,7 @@ struct TORCH_API SplitUntil32Bit { iterator end() const; private: + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) const TensorIteratorBase& iter; }; diff --git a/aten/src/ATen/TensorIteratorInternal.h b/aten/src/ATen/TensorIteratorInternal.h index 1b4d4963b8638..ec0cb6c8fdfcb 100644 --- a/aten/src/ATen/TensorIteratorInternal.h +++ b/aten/src/ATen/TensorIteratorInternal.h @@ -25,8 +25,8 @@ inline void get_data_ptrs( ArrayRef base, IntArrayRef strides, IntArrayRef counter) { - const int64_t ntensors = base.size(); - const int64_t ndim = counter.size(); + const auto ntensors = base.size(); + const auto ndim = counter.size(); std::copy(base.begin(), base.end(), ptrs); for (const auto dim : c10::irange(ndim)) { int64_t value = counter[dim]; diff --git a/aten/src/ATen/TensorNames.cpp b/aten/src/ATen/TensorNames.cpp index fe7165816eb87..bff12aa8de65f 100644 --- a/aten/src/ATen/TensorNames.cpp +++ b/aten/src/ATen/TensorNames.cpp @@ -53,8 +53,9 @@ TensorNames::TensorNames(ArrayRef names) { } TensorNames::TensorNames(ArrayRef names, int64_t start, int64_t end) { - start = maybe_wrap_dim(start, names.size()); - end = maybe_wrap_dim(end, names.size()); + int64_t names_size = static_cast(names.size()); + start = maybe_wrap_dim(start, names_size); + end = maybe_wrap_dim(end, names_size); names_.reserve(end - start); for (const auto idx : c10::irange(start, end)) { names_.emplace_back(names, idx); @@ -83,7 +84,7 @@ TensorNames& TensorNames::unifyFromRightInplace(const TensorNames& other, const return *this; } -void TensorNames::append(TensorName&& name) { +void TensorNames::append(TensorName name) { names_.emplace_back(name); } diff --git a/aten/src/ATen/TensorNames.h b/aten/src/ATen/TensorNames.h index 4ec3d064867fb..616efc14d2599 100644 --- a/aten/src/ATen/TensorNames.h +++ b/aten/src/ATen/TensorNames.h @@ -63,11 +63,11 @@ struct TORCH_API TensorNames { const char* op_name = "unify"); void checkUnique(const char* op_name) const; - void append(TensorName&& name); + void append(TensorName name); std::vector toDimnameVec() const; private: - explicit TensorNames(TensorNameVec&& names) : names_(names){}; + explicit TensorNames(TensorNameVec&& names) : names_(std::move(names)){}; TensorNameVec names_; }; diff --git a/aten/src/ATen/TensorOperators.h b/aten/src/ATen/TensorOperators.h index feaad09438a80..7567af4cbfe46 100644 --- a/aten/src/ATen/TensorOperators.h +++ b/aten/src/ATen/TensorOperators.h @@ -9,9 +9,6 @@ #include #endif -#include -#include - namespace at { #define AT_FORALL_BINARY_OPS(_) \ diff --git a/aten/src/ATen/TensorSubclassLikeUtils.h b/aten/src/ATen/TensorSubclassLikeUtils.h index 44b4223245903..a9a0b4ecdcf8b 100644 --- a/aten/src/ATen/TensorSubclassLikeUtils.h +++ b/aten/src/ATen/TensorSubclassLikeUtils.h @@ -43,8 +43,7 @@ constexpr auto kTensorSubclassLike = // no matter the backend component DispatchKey::Batched, DispatchKey::Sparse, - DispatchKey::SparseCsrCPU, - 
DispatchKey::SparseCsrCUDA, + DispatchKey::SparseCsr, DispatchKey::Python}) | DispatchKeySet(BackendComponent::MetaBit); diff --git a/aten/src/ATen/TensorUtils.cpp b/aten/src/ATen/TensorUtils.cpp index 6e1ce8166cb77..e425a0a8ed130 100644 --- a/aten/src/ATen/TensorUtils.cpp +++ b/aten/src/ATen/TensorUtils.cpp @@ -9,7 +9,7 @@ namespace at { -std::ostream& operator<<(std::ostream & out, TensorGeometryArg t) { +std::ostream& operator<<(std::ostream & out, const TensorGeometryArg& t) { if (t.pos == 0) { // 0 is distinguished; it usually indicates 'self' or the return // tensor @@ -68,7 +68,7 @@ void checkAllContiguous(CheckedFrom c, at::ArrayRef ts) { } void checkSize(CheckedFrom c, const TensorGeometryArg& t, IntArrayRef sizes) { - checkDim(c, t, sizes.size()); + checkDim(c, t, static_cast(sizes.size())); TORCH_CHECK( t->sizes().equals(sizes), "Expected tensor of size ", sizes, ", but got tensor of size ", t->sizes(), @@ -76,7 +76,7 @@ void checkSize(CheckedFrom c, const TensorGeometryArg& t, IntArrayRef sizes) { } void checkSize_symint(CheckedFrom c, const TensorGeometryArg& t, c10::SymIntArrayRef sizes) { - checkDim(c, t, sizes.size()); + checkDim(c, t, static_cast(sizes.size())); TORCH_CHECK( t->sym_sizes().equals(sizes), "Expected tensor of size ", sizes, ", but got tensor of size ", t->sizes(), @@ -91,7 +91,7 @@ void checkSize(CheckedFrom c, const TensorGeometryArg& t, int64_t dim, int64_t s " (while checking arguments for ", c, ")"); } -void checkSize_symint(CheckedFrom c, const TensorGeometryArg& t, int64_t dim, c10::SymInt size) { +void checkSize_symint(CheckedFrom c, const TensorGeometryArg& t, int64_t dim, const c10::SymInt& size) { TORCH_CHECK( t->sym_size(dim) == size, "Expected tensor to have size ", size, " at dimension ", dim, @@ -343,12 +343,13 @@ inline c10::optional computeStride_impl( // This could perhaps be combined with the below code, but the complexity // didn't seem worth it. const Numel numel = c10::multiply_integers(oldshape); - if (numel == 0 && oldshape.equals(newshape)) { + bool zero_numel = TORCH_GUARD_SIZE_OBLIVIOUS(sym_eq(numel, 0)); + if (zero_numel && oldshape.equals(newshape)) { return toResult(oldstride); } ResultVec newstride(newshape.size()); - if (numel == 0) { + if (zero_numel) { for (int64_t view_d = newshape.size() - 1; view_d >= 0; view_d--) { if (view_d == (int64_t)(newshape.size() - 1)) { newstride[view_d] = 1; @@ -370,10 +371,10 @@ inline c10::optional computeStride_impl( tensor_numel *= oldshape[tensor_d]; // if end of tensor size chunk, check view if ((tensor_d == 0) || - (oldshape[tensor_d - 1] != 1 && + (TORCH_GUARD_SIZE_OBLIVIOUS(sym_ne(oldshape[tensor_d - 1], 1)) && oldstride[tensor_d - 1] != tensor_numel * chunk_base_stride)) { while (view_d >= 0 && - (view_numel < tensor_numel || newshape[view_d] == 1)) { + (TORCH_GUARD_SIZE_OBLIVIOUS(sym_lt(view_numel, tensor_numel)) || TORCH_GUARD_SIZE_OBLIVIOUS(sym_eq(newshape[view_d], 1)))) { newstride[view_d] = view_numel * chunk_base_stride; view_numel *= newshape[view_d]; view_d--; diff --git a/aten/src/ATen/TensorUtils.h b/aten/src/ATen/TensorUtils.h index 4a95e622257e7..4615ab50606ee 100644 --- a/aten/src/ATen/TensorUtils.h +++ b/aten/src/ATen/TensorUtils.h @@ -20,12 +20,14 @@ namespace at { // which do NO argument checking by default. 
struct TORCH_API TensorArg { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) const Tensor& tensor; const char* name; int pos; // 1-indexed TensorArg(const Tensor& tensor, const char* name, int pos) : tensor(tensor), name(name), pos(pos) {} // Try to mitigate any possibility of dangling reference to temporaries. + // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) TensorArg(Tensor&& tensor, const char* name, int pos) = delete; const Tensor* operator->() const { return &tensor; @@ -66,7 +68,9 @@ using CheckedFrom = const char*; // not TensorGeometryArg, because the Tensor to TensorGeometry // conversion will blow up if you have undefined tensors. -TORCH_API std::ostream& operator<<(std::ostream& out, TensorGeometryArg t); +TORCH_API std::ostream& operator<<( + std::ostream& out, + const TensorGeometryArg& t); TORCH_API void checkDim( CheckedFrom c, const Tensor& tensor, @@ -103,7 +107,7 @@ TORCH_API void checkSize_symint( CheckedFrom c, const TensorGeometryArg& t, int64_t dim, - c10::SymInt size); + const c10::SymInt& size); TORCH_API void checkNumel( CheckedFrom c, const TensorGeometryArg& t, diff --git a/aten/src/ATen/ThreadLocalState.h b/aten/src/ATen/ThreadLocalState.h index 7cae9997ab05a..8419499c3a563 100644 --- a/aten/src/ATen/ThreadLocalState.h +++ b/aten/src/ATen/ThreadLocalState.h @@ -1,7 +1,5 @@ #pragma once -#include - #include #include #include @@ -98,6 +96,7 @@ class TORCH_API ThreadLocalStateGuard { } private: + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) const ThreadLocalState prev_state_; }; diff --git a/aten/src/ATen/Version.cpp b/aten/src/ATen/Version.cpp index eb71fe315d430..cf33d89e0814e 100644 --- a/aten/src/ATen/Version.cpp +++ b/aten/src/ATen/Version.cpp @@ -190,8 +190,8 @@ std::string show_config() { ss << detail::getCUDAHooks().showConfig(); } - if (hasORT()) { - ss << detail::getORTHooks().showConfig(); + if (hasMAIA()) { + ss << detail::getMAIAHooks().showConfig(); } if (hasXPU()) { diff --git a/aten/src/ATen/WrapDimUtils.h b/aten/src/ATen/WrapDimUtils.h index 142665b7c8b27..8b1ad3026cd04 100644 --- a/aten/src/ATen/WrapDimUtils.h +++ b/aten/src/ATen/WrapDimUtils.h @@ -104,37 +104,40 @@ inline void maybe_wrap_dims( // dimension behavior and dimension size checking). We maintain this behavior // for backwards compatibility, but only for this specific size (i.e. other // empty sizes are not skipped). 
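To illustrate the behavior the functions below preserve: a 1-D tensor of size 0 (the legacy "empty" shape) is skipped, and the dimension is wrapped against the first remaining entry. A small, hypothetical example (shapes chosen for illustration only):

// Hypothetical example of the legacy cat dim-wrapping rule.
const int64_t legacy_empty[] = {0};       // 1-D, size 0: skipped
const int64_t shape[] = {2, 3, 4};
std::vector<c10::IntArrayRef> sizes = {legacy_empty, shape};
int64_t d = at::legacy_cat_wrap_dim(-1, sizes);  // wraps -1 against ndim 3 -> 2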
-template -inline int64_t _legacy_cat_wrap_dim( +inline int64_t legacy_cat_wrap_dim( int64_t dim, - const std::vector>& tensor_sizes) { + const std::vector>& tensor_sizes) { for (auto& sizes : tensor_sizes) { if (sizes.size() == 1 && sizes[0] == 0) { continue; } - return maybe_wrap_dim(dim, sizes.size()); + return maybe_wrap_dim(dim, static_cast(sizes.size())); } return dim; } -inline int64_t legacy_cat_wrap_dim( - int64_t dim, - const std::vector>& tensor_sizes) { - return _legacy_cat_wrap_dim(dim, tensor_sizes); -} - inline int64_t legacy_cat_wrap_dim_symint( int64_t dim, const std::vector>& tensor_sizes) { - return _legacy_cat_wrap_dim(dim, tensor_sizes); + for (auto& sizes : tensor_sizes) { + if (sizes.size() == 1) { + if (TORCH_GUARD_SIZE_OBLIVIOUS(sizes[0].sym_eq(0))) { + continue; + } + } + return maybe_wrap_dim(dim, static_cast(sizes.size())); + } + return dim; } inline int64_t legacy_cat_wrap_dim( int64_t dim, const MaterializedITensorListRef& tensors) { for (const Tensor& tensor : tensors) { - if (tensor.dim() == 1 && tensor.sizes()[0] == 0) { - continue; + if (tensor.dim() == 1) { + if (TORCH_GUARD_SIZE_OBLIVIOUS(tensor.sym_sizes()[0].sym_eq(0))) { + continue; + } } return maybe_wrap_dim(dim, tensor.dim()); } diff --git a/aten/src/ATen/autocast_mode.cpp b/aten/src/ATen/autocast_mode.cpp index 7282bba9e6889..c233f17b44580 100644 --- a/aten/src/ATen/autocast_mode.cpp +++ b/aten/src/ATen/autocast_mode.cpp @@ -6,60 +6,14 @@ namespace at::autocast { -bool is_enabled() { - return !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::AutocastCUDA); +bool is_autocast_enabled(at::DeviceType device_type) { + at::DispatchKey dispatch_key = get_autocast_dispatch_key_from_device_type(device_type); + return !c10::impl::tls_is_dispatch_key_excluded(dispatch_key); } -void set_enabled(bool new_enabled) { - c10::impl::tls_set_dispatch_key_excluded(DispatchKey::AutocastCUDA, !new_enabled); -} - -bool is_cpu_enabled() { - return !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::AutocastCPU); -} - -void set_cpu_enabled(bool new_enabled) { - c10::impl::tls_set_dispatch_key_excluded(DispatchKey::AutocastCPU, !new_enabled); -} - -bool is_xpu_enabled() { - return !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::AutocastXPU); -} - -void set_xpu_enabled(bool new_enabled) { - c10::impl::tls_set_dispatch_key_excluded(DispatchKey::AutocastXPU, !new_enabled); -} - -bool is_ipu_enabled() { - return !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::AutocastIPU); -} - -void set_ipu_enabled(bool new_enabled) { - c10::impl::tls_set_dispatch_key_excluded(DispatchKey::AutocastIPU, !new_enabled); -} - -bool is_hpu_enabled() { - return !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::AutocastHPU); -} - -void set_hpu_enabled(bool new_enabled) { - c10::impl::tls_set_dispatch_key_excluded(DispatchKey::AutocastHPU, !new_enabled); -} - -bool is_xla_enabled() { - return !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::AutocastXLA); -} - -void set_xla_enabled(bool new_enabled) { - c10::impl::tls_set_dispatch_key_excluded(DispatchKey::AutocastXLA, !new_enabled); -} - -bool is_privateuseone_enabled() { - return !c10::impl::tls_is_dispatch_key_excluded(DispatchKey::AutocastPrivateUse1); -} - -void set_privateuseone_enabled(bool new_enabled) { - c10::impl::tls_set_dispatch_key_excluded(DispatchKey::AutocastPrivateUse1, !new_enabled); +void set_autocast_enabled(at::DeviceType device_type, bool enabled) { + at::DispatchKey dispatch_key = get_autocast_dispatch_key_from_device_type(device_type); + 
c10::impl::tls_set_dispatch_key_excluded(dispatch_key, !enabled); } namespace { @@ -91,30 +45,40 @@ std::mutex cached_casts_mutex; // it calls clear_cache() to ensure cached Tensors don't leak outside the autocasting region. thread_local int nesting = 0; -// autocast_cpu_dtype is the lower_precision_fp used by AutocastCPU. -thread_local at::ScalarType autocast_cpu_dtype = at::kBFloat16; - -// autocast_xpu_dtype is the lower_precision_fp used by AutocastXPU. -thread_local at::ScalarType autocast_xpu_dtype = at::kBFloat16; - -// autocast_ipu_dtype is the lower_precision_fp used by AutocastIPU. -thread_local at::ScalarType autocast_ipu_dtype = at::kHalf; - -// autocast_hpu_dtype is the lower_precision_fp used by AutocastHPU. -thread_local at::ScalarType autocast_hpu_dtype = at::kBFloat16; - -// autocast_xla_dtype is the lower_precision_fp used by AutocastXLA. -thread_local at::ScalarType autocast_xla_dtype = at::kBFloat16; +// The order of this array MUST exactly match the definition order of DeviceType +// in c10/core/DeviceType.h. +static_assert( + at::COMPILE_TIME_MAX_DEVICE_TYPES == 21, + "The definition of the default autocast data type per device backend doesn't match with the definition of the device type."); +thread_local std::array + autocast_dtype = { + at::kBFloat16, // CPU + at::kHalf, // CUDA. + at::ScalarType::Undefined, // Reserved for explicit MKLDNN + at::ScalarType::Undefined, // OpenGL + at::ScalarType::Undefined, // OpenCL + at::ScalarType::Undefined, // IDEEP. + at::kHalf, // AMD HIP + at::ScalarType::Undefined, // FPGA + at::ScalarType::Undefined, // ONNX Runtime / Microsoft + at::kBFloat16, // XLA / TPU + at::ScalarType::Undefined, // Vulkan + at::ScalarType::Undefined, // Metal + at::kBFloat16, // XPU + at::ScalarType::Undefined, // MPS + at::ScalarType::Undefined, // Meta (tensors with no data) + at::kBFloat16, // HPU / HABANA + at::ScalarType::Undefined, // SX-Aurora / NEC + at::ScalarType::Undefined, // Lazy Tensors + at::kHalf, // Graphcore IPU + at::ScalarType::Undefined, // Meta training and inference devices + at::kHalf, // PrivateUse1 device +}; // should we enabled the cache inside autocast. thread_local bool cache_enabled = true; -// autocast_gpu_dtype is the lower_precision_fp used by AutocastGPU. -thread_local at::ScalarType autocast_gpu_dtype = at::kHalf; - -// autocast_privateuseone_dtype is the lower_precision_fp used by AutocastPrivateUse1. 
-thread_local at::ScalarType autocast_privateuseone_dtype = at::kHalf; -} +} // anonymous namespace void clear_cache() { const std::lock_guard lock(cached_casts_mutex); @@ -129,60 +93,12 @@ int decrement_nesting() { return --nesting; } -at::ScalarType get_autocast_gpu_dtype() { - return autocast_gpu_dtype; +at::ScalarType get_autocast_dtype(at::DeviceType device_type) { + return autocast_dtype[static_cast(device_type)]; } -at::ScalarType get_autocast_cpu_dtype() { - return autocast_cpu_dtype; -} - -at::ScalarType get_autocast_xpu_dtype() { - return autocast_xpu_dtype; -} - -at::ScalarType get_autocast_ipu_dtype() { - return autocast_ipu_dtype; -} - -at::ScalarType get_autocast_hpu_dtype() { - return autocast_hpu_dtype; -} - -at::ScalarType get_autocast_xla_dtype() { - return autocast_xla_dtype; -} - -at::ScalarType get_autocast_privateuseone_dtype() { - return autocast_privateuseone_dtype; -} - -void set_autocast_cpu_dtype(at::ScalarType dtype) { - autocast_cpu_dtype = dtype; -} - -void set_autocast_gpu_dtype(at::ScalarType dtype) { - autocast_gpu_dtype = dtype; -} - -void set_autocast_xpu_dtype(at::ScalarType dtype) { - autocast_xpu_dtype = dtype; -} - -void set_autocast_ipu_dtype(at::ScalarType dtype) { - autocast_ipu_dtype = dtype; -} - -void set_autocast_hpu_dtype(at::ScalarType dtype) { - autocast_hpu_dtype = dtype; -} - -void set_autocast_xla_dtype(at::ScalarType dtype) { - autocast_xla_dtype = dtype; -} - -void set_autocast_privateuseone_dtype(at::ScalarType dtype) { - autocast_privateuseone_dtype = dtype; +void set_autocast_dtype(at::DeviceType device_type, at::ScalarType dtype) { + autocast_dtype[static_cast(device_type)] = dtype; } bool is_autocast_cache_enabled() { @@ -241,135 +157,46 @@ namespace { /***************************************** Explicit registration for out-of-place ops *****************************************/ + TORCH_LIBRARY_IMPL(_, Autocast, m) { m.fallback(torch::CppFunction::makeFallthrough()); } TORCH_LIBRARY_IMPL(aten, Autocast, m) { // lower_precision_fp - KERNEL_CUDA2(_convolution, deprecated, lower_precision_fp) - KERNEL_CUDA(_convolution, lower_precision_fp) - KERNEL_CUDA(conv1d, lower_precision_fp) - KERNEL_CUDA(conv2d, lower_precision_fp) - KERNEL_CUDA(conv3d, lower_precision_fp) - KERNEL_CUDA(conv_tbc, lower_precision_fp) - KERNEL_CUDA(conv_transpose1d, lower_precision_fp) - KERNEL_CUDA2(conv_transpose2d, input, lower_precision_fp) - KERNEL_CUDA2(conv_transpose3d, input, lower_precision_fp) - KERNEL_CUDA(convolution, lower_precision_fp) +#define _KERNEL_CUDA_LOW_PRECISION_FP(...) 
\ + KERNEL_CUDA(__VA_ARGS__, lower_precision_fp) + + AT_FORALL_LOWER_PRECISION_FP(_KERNEL_CUDA_LOW_PRECISION_FP) KERNEL_CUDA(cudnn_convolution, lower_precision_fp) KERNEL_CUDA(cudnn_convolution_transpose, lower_precision_fp) - KERNEL_CUDA(prelu, lower_precision_fp) - KERNEL_CUDA(addmm, lower_precision_fp) - KERNEL_CUDA(addmv, lower_precision_fp) - KERNEL_CUDA(addr, lower_precision_fp) - KERNEL_CUDA(matmul, lower_precision_fp) - KERNEL_CUDA(einsum, lower_precision_fp) - KERNEL_CUDA(mm, lower_precision_fp) - KERNEL_CUDA(mv, lower_precision_fp) - KERNEL_CUDA(linalg_vecdot, lower_precision_fp) - KERNEL_CUDA(linear, lower_precision_fp) - KERNEL_CUDA(addbmm, lower_precision_fp) - KERNEL_CUDA(baddbmm, lower_precision_fp) - KERNEL_CUDA(bmm, lower_precision_fp) - KERNEL_CUDA(chain_matmul, lower_precision_fp) - KERNEL_CUDA(linalg_multi_dot, lower_precision_fp) - KERNEL_CUDA(_thnn_fused_lstm_cell, lower_precision_fp) - KERNEL_CUDA(_thnn_fused_gru_cell, lower_precision_fp) - KERNEL_CUDA(lstm_cell, lower_precision_fp) - KERNEL_CUDA(gru_cell, lower_precision_fp) - KERNEL_CUDA(rnn_tanh_cell, lower_precision_fp) - KERNEL_CUDA(rnn_relu_cell, lower_precision_fp) - KERNEL_CUDA(_scaled_dot_product_flash_attention, lower_precision_fp) - KERNEL_CUDA(scaled_dot_product_attention, lower_precision_fp) // fp32 - KERNEL_CUDA(acos, fp32) - KERNEL_CUDA(asin, fp32) - KERNEL_CUDA(cosh, fp32) - KERNEL_CUDA(erfinv, fp32) - KERNEL_CUDA(exp, fp32) - KERNEL_CUDA(expm1, fp32) - KERNEL_CUDA(log, fp32) - KERNEL_CUDA(log10, fp32) - KERNEL_CUDA(log2, fp32) - KERNEL_CUDA(log1p, fp32) - KERNEL_CUDA(reciprocal, fp32) - KERNEL_CUDA(rsqrt, fp32) - KERNEL_CUDA(sinh, fp32) - KERNEL_CUDA(tan, fp32) - KERNEL_CUDA2(pow, Tensor_Scalar, fp32) - KERNEL_CUDA2(pow, Tensor_Tensor, fp32) - KERNEL_CUDA2(pow, Scalar, fp32) - KERNEL_CUDA(softplus, fp32) - KERNEL_CUDA(layer_norm, fp32) - KERNEL_CUDA(native_layer_norm, fp32) - KERNEL_CUDA(group_norm, fp32) - KERNEL_CUDA2(frobenius_norm, dim, fp32) - KERNEL_CUDA(nuclear_norm, fp32) - KERNEL_CUDA2(nuclear_norm, dim, fp32) - KERNEL_CUDA(cosine_similarity, fp32) - KERNEL_CUDA(poisson_nll_loss, fp32) - KERNEL_CUDA(cosine_embedding_loss, fp32) - KERNEL_CUDA(nll_loss, fp32) - KERNEL_CUDA(nll_loss2d, fp32) - KERNEL_CUDA(hinge_embedding_loss, fp32) - KERNEL_CUDA(kl_div, fp32) - KERNEL_CUDA(l1_loss, fp32) - KERNEL_CUDA(smooth_l1_loss, fp32) - KERNEL_CUDA(huber_loss, fp32) - KERNEL_CUDA(mse_loss, fp32) - KERNEL_CUDA(margin_ranking_loss, fp32) - KERNEL_CUDA(multilabel_margin_loss, fp32) - KERNEL_CUDA(soft_margin_loss, fp32) - KERNEL_CUDA(triplet_margin_loss, fp32) - KERNEL_CUDA(multi_margin_loss, fp32) - KERNEL_CUDA(binary_cross_entropy_with_logits, fp32) - KERNEL_CUDA(dist, fp32) - KERNEL_CUDA(pdist, fp32) - KERNEL_CUDA(cdist, fp32) - KERNEL_CUDA(renorm, fp32) - KERNEL_CUDA(logsumexp, fp32) +#define _KERNEL_CUDA_FP32(...) 
KERNEL_CUDA(__VA_ARGS__, fp32) + + AT_FORALL_FP32(_KERNEL_CUDA_FP32) + // fp32_set_opt_dtype - KERNEL_CUDA(prod, fp32_set_opt_dtype) - KERNEL_CUDA2(prod, dim_int, fp32_set_opt_dtype) - KERNEL_CUDA2(prod, dim_Dimname, fp32_set_opt_dtype) - KERNEL_CUDA2(softmax, int, fp32_set_opt_dtype) - KERNEL_CUDA2(softmax, Dimname, fp32_set_opt_dtype) - KERNEL_CUDA2(log_softmax, int, fp32_set_opt_dtype) - KERNEL_CUDA2(log_softmax, Dimname, fp32_set_opt_dtype) - KERNEL_CUDA(cumprod, fp32_set_opt_dtype) - KERNEL_CUDA2(cumprod, dimname, fp32_set_opt_dtype) - KERNEL_CUDA(cumsum, fp32_set_opt_dtype) - KERNEL_CUDA2(cumsum, dimname, fp32_set_opt_dtype) - KERNEL_CUDA(linalg_vector_norm, fp32_set_opt_dtype) - KERNEL_CUDA(linalg_matrix_norm, fp32_set_opt_dtype) - KERNEL_CUDA2(linalg_matrix_norm, str_ord, fp32_set_opt_dtype) +#define _KERNEL_CUDA_FP32_SET_OPT_DTYPE(...) \ + KERNEL_CUDA(__VA_ARGS__, fp32_set_opt_dtype) + + AT_FORALL_FP32_SET_OPT_DTYPE(_KERNEL_CUDA_FP32_SET_OPT_DTYPE) // commenting these out because they accept an explicit (not-optional) dtype, and we shouldn't try to flip that even // when autocasting. - // KERNEL_CUDA2(norm, ScalarOpt_dtype, fp32_set_opt_dtype) - // KERNEL_CUDA2(norm, ScalarOpt_dim_dtype, fp32_set_opt_dtype) - // KERNEL_CUDA2(norm, names_ScalarOpt_dim_dtype, fp32_set_opt_dtype) - KERNEL_CUDA(sum, fp32_set_opt_dtype) - KERNEL_CUDA2(sum, dim_IntList, fp32_set_opt_dtype) - KERNEL_CUDA2(sum, dim_DimnameList, fp32_set_opt_dtype) + // KERNEL_CUDA(norm, ScalarOpt_dtype, fp32_set_opt_dtype) + // KERNEL_CUDA(norm, ScalarOpt_dim_dtype, fp32_set_opt_dtype) + // KERNEL_CUDA(norm, names_ScalarOpt_dim_dtype, fp32_set_opt_dtype) + // fp32_append_dtype // The fp32_append_dtype wrapper overrides implicit promotion behavior. // norm does not implicitly promote, but be aware when adding new ops to this policy. - KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CUDA(ADD_NS(norm), "norm.Scalar", Tensor (const Tensor &, const Scalar&), Tensor (const Tensor &, const c10::optional&, ScalarType), fp32_append_dtype) - KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CUDA(ADD_NS(norm), "norm.ScalarOpt_dim", Tensor (const Tensor &, const c10::optional&, IntArrayRef, bool), Tensor (const Tensor &, const c10::optional&, IntArrayRef, bool, ScalarType), fp32_append_dtype) - KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CUDA(ADD_NS(norm), "norm.names_ScalarOpt_dim", Tensor (const Tensor &, const c10::optional&, DimnameList, bool), Tensor (const Tensor &, const c10::optional&, DimnameList, bool, ScalarType), fp32_append_dtype) + AT_FORALL_DIFFERENT_REDISPATCH_SIGNATURE( + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CUDA) + // promote - KERNEL_CUDA(addcdiv, promote) - KERNEL_CUDA(addcmul, promote) - KERNEL_CUDA(atan2, promote) - KERNEL_CUDA(bilinear, promote) - KERNEL_CUDA(cross, promote) - KERNEL_CUDA(dot, promote) - KERNEL_CUDA(grid_sampler, promote) - KERNEL_CUDA(index_put, promote) - KERNEL_CUDA(tensordot, promote) - KERNEL_CUDA(scatter_add, promote) +#define _KERNEL_CUDA_PROMOTE(...) 
KERNEL_CUDA(__VA_ARGS__, promote) + + AT_FORALL_PROMOTE(_KERNEL_CUDA_PROMOTE) m.impl(TORCH_SELECTIVE_NAME("aten::binary_cross_entropy"), TORCH_FN((&at::autocast::binary_cross_entropy_banned))); @@ -383,11 +210,11 @@ TORCH_LIBRARY_IMPL(_, AutocastCPU, m) { TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { // lower_precision_fp cast policy KERNEL_CPU(conv1d, lower_precision_fp) - KERNEL_CPU2(conv1d, padding, lower_precision_fp) + KERNEL_CPU(conv1d, padding, lower_precision_fp) KERNEL_CPU(conv2d, lower_precision_fp) - KERNEL_CPU2(conv2d, padding, lower_precision_fp) + KERNEL_CPU(conv2d, padding, lower_precision_fp) KERNEL_CPU(conv3d, lower_precision_fp) - KERNEL_CPU2(conv3d, padding, lower_precision_fp) + KERNEL_CPU(conv3d, padding, lower_precision_fp) KERNEL_CPU(bmm, lower_precision_fp) KERNEL_CPU(mm, lower_precision_fp) KERNEL_CPU(linalg_vecdot, lower_precision_fp) @@ -395,13 +222,13 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(addmm, lower_precision_fp) KERNEL_CPU(addbmm, lower_precision_fp) KERNEL_CPU(linear, lower_precision_fp) - KERNEL_CPU2(_convolution, deprecated, lower_precision_fp) + KERNEL_CPU(_convolution, deprecated, lower_precision_fp) KERNEL_CPU(matmul, lower_precision_fp) KERNEL_CPU(conv_tbc, lower_precision_fp) KERNEL_CPU(mkldnn_rnn_layer, lower_precision_fp) KERNEL_CPU(conv_transpose1d, lower_precision_fp) - KERNEL_CPU2(conv_transpose2d, input, lower_precision_fp) - KERNEL_CPU2(conv_transpose3d, input, lower_precision_fp) + KERNEL_CPU(conv_transpose2d, input, lower_precision_fp) + KERNEL_CPU(conv_transpose3d, input, lower_precision_fp) KERNEL_CPU(prelu, lower_precision_fp) KERNEL_CPU(scaled_dot_product_attention, lower_precision_fp) KERNEL_CPU(_native_multi_head_attention, lower_precision_fp) @@ -412,14 +239,14 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(grid_sampler, fp32) KERNEL_CPU(polar, fp32) KERNEL_CPU(prod, fp32) - KERNEL_CPU2(prod, dim_int, fp32) - KERNEL_CPU2(prod, dim_Dimname, fp32) + KERNEL_CPU(prod, dim_int, fp32) + KERNEL_CPU(prod, dim_Dimname, fp32) KERNEL_CPU(quantile, fp32) - KERNEL_CPU2(quantile, scalar, fp32) + KERNEL_CPU(quantile, scalar, fp32) KERNEL_CPU(nanquantile, fp32) - KERNEL_CPU2(nanquantile, scalar, fp32) + KERNEL_CPU(nanquantile, scalar, fp32) KERNEL_CPU(stft, fp32) - KERNEL_CPU2(stft, center, fp32) + KERNEL_CPU(stft, center, fp32) KERNEL_CPU(cdist, fp32) KERNEL_CPU(grid_sampler_2d, fp32) KERNEL_CPU(_grid_sampler_2d_cpu_fallback, fp32) @@ -457,8 +284,8 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(soft_margin_loss, fp32) KERNEL_CPU(triplet_margin_loss, fp32) KERNEL_CPU(multi_margin_loss, fp32) - KERNEL_CPU2(ctc_loss, IntList, fp32) - KERNEL_CPU2(ctc_loss, Tensor, fp32) + KERNEL_CPU(ctc_loss, IntList, fp32) + KERNEL_CPU(ctc_loss, Tensor, fp32) KERNEL_CPU(kl_div, fp32) KERNEL_CPU(multilabel_margin_loss, fp32) KERNEL_CPU(binary_cross_entropy_with_logits, fp32) @@ -477,11 +304,11 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(fft_hfft, fp32) KERNEL_CPU(fft_ihfft, fp32) KERNEL_CPU(linalg_cond, fp32) - KERNEL_CPU2(linalg_cond, p_str, fp32) + KERNEL_CPU(linalg_cond, p_str, fp32) KERNEL_CPU(linalg_matrix_rank, fp32) - KERNEL_CPU2(linalg_matrix_rank, tol_tensor, fp32) - KERNEL_CPU2(linalg_matrix_rank, atol_rtol_tensor, fp32) - KERNEL_CPU2(linalg_matrix_rank, atol_rtol_float, fp32) + KERNEL_CPU(linalg_matrix_rank, tol_tensor, fp32) + KERNEL_CPU(linalg_matrix_rank, atol_rtol_tensor, fp32) + KERNEL_CPU(linalg_matrix_rank, atol_rtol_float, fp32) KERNEL_CPU(linalg_solve, fp32) KERNEL_CPU(linalg_cholesky, fp32) 
KERNEL_CPU(linalg_svdvals, fp32) @@ -513,8 +340,45 @@ TORCH_LIBRARY_IMPL(aten, AutocastCPU, m) { KERNEL_CPU(stack, promote) KERNEL_CPU(cat, promote) KERNEL_CPU(index_copy, promote) - KERNEL_CPU2(index_copy, dimname, promote) + KERNEL_CPU(index_copy, dimname, promote) + +} +TORCH_LIBRARY_IMPL(_, AutocastXPU, m) { + m.fallback(torch::CppFunction::makeFallthrough()); +} + +TORCH_LIBRARY_IMPL(aten, AutocastXPU, m) { + // lower_precision_fp +#define _KERNEL_XPU_LOW_PRECISION_FP(...) \ + KERNEL_XPU(__VA_ARGS__, lower_precision_fp) + + AT_FORALL_LOWER_PRECISION_FP(_KERNEL_XPU_LOW_PRECISION_FP) + + // fp32 +#define _KERNEL_XPU_FP32(...) KERNEL_XPU(__VA_ARGS__, fp32) + + AT_FORALL_FP32(_KERNEL_XPU_FP32) + + // fp32_set_opt_dtype +#define _KERNEL_XPU_FP32_SET_OPT_DTYPE(...) \ + KERNEL_XPU(__VA_ARGS__, fp32_set_opt_dtype) + + AT_FORALL_FP32_SET_OPT_DTYPE(_KERNEL_XPU_FP32_SET_OPT_DTYPE) + + // fp32_append_dtype + // The fp32_append_dtype wrapper overrides implicit promotion behavior. + // norm does not implicitly promote, but be aware when adding new ops to this policy. + AT_FORALL_DIFFERENT_REDISPATCH_SIGNATURE( + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_XPU) + + // promote +#define _KERNEL_XPU_PROMOTE(...) KERNEL_XPU(__VA_ARGS__, promote) + + AT_FORALL_PROMOTE(_KERNEL_XPU_PROMOTE) + + m.impl(TORCH_SELECTIVE_NAME("aten::binary_cross_entropy"), + TORCH_FN((&at::autocast::binary_cross_entropy_banned))); } } // namespace diff --git a/aten/src/ATen/autocast_mode.h b/aten/src/ATen/autocast_mode.h index b3f2fcd511ff6..59a91848a5175 100644 --- a/aten/src/ATen/autocast_mode.h +++ b/aten/src/ATen/autocast_mode.h @@ -10,40 +10,120 @@ namespace at::autocast { -TORCH_API bool is_enabled(); -TORCH_API void set_enabled(bool enabled); +TORCH_API bool is_autocast_enabled(at::DeviceType device_type); +TORCH_API void set_autocast_enabled(at::DeviceType device_type, bool enabled); +TORCH_API at::ScalarType get_autocast_dtype(at::DeviceType device_type); +TORCH_API void set_autocast_dtype( + at::DeviceType device_type, + at::ScalarType dtype); TORCH_API void clear_cache(); TORCH_API int increment_nesting(); TORCH_API int decrement_nesting(); -TORCH_API bool is_cpu_enabled(); -TORCH_API void set_cpu_enabled(bool enabled); -TORCH_API at::ScalarType get_autocast_gpu_dtype(); -TORCH_API at::ScalarType get_autocast_cpu_dtype(); -TORCH_API void set_autocast_gpu_dtype(at::ScalarType dtype); -TORCH_API void set_autocast_cpu_dtype(at::ScalarType dtype); -TORCH_API bool is_xpu_enabled(); -TORCH_API void set_xpu_enabled(bool enabled); -TORCH_API at::ScalarType get_autocast_xpu_dtype(); -TORCH_API void set_autocast_xpu_dtype(at::ScalarType dtype); -TORCH_API bool is_ipu_enabled(); -TORCH_API void set_ipu_enabled(bool enabled); -TORCH_API at::ScalarType get_autocast_ipu_dtype(); -TORCH_API void set_autocast_ipu_dtype(at::ScalarType dtype); -TORCH_API bool is_hpu_enabled(); -TORCH_API void set_hpu_enabled(bool enabled); -TORCH_API at::ScalarType get_autocast_hpu_dtype(); -TORCH_API void set_autocast_hpu_dtype(at::ScalarType dtype); -TORCH_API bool is_xla_enabled(); -TORCH_API void set_xla_enabled(bool enabled); -TORCH_API at::ScalarType get_autocast_xla_dtype(); -TORCH_API void set_autocast_xla_dtype(at::ScalarType dtype); -TORCH_API bool is_privateuseone_enabled(); -TORCH_API void set_privateuseone_enabled(bool enabled); -TORCH_API at::ScalarType get_autocast_privateuseone_dtype(); -TORCH_API void set_autocast_privateuseone_dtype(at::ScalarType dtype); TORCH_API bool is_autocast_cache_enabled(); TORCH_API void 
set_autocast_cache_enabled(bool enabled); +// deprecated CUDA-specific autocast APIs +C10_DEPRECATED_MESSAGE( + "at::autocast::is_enabled() is deprecated. Please use at::autocast::is_autocast_enabled(at::kCUDA) instead.") +TORCH_API inline bool is_enabled() { + TORCH_WARN_DEPRECATION( + "at::autocast::", + __func__, + "() is deprecated. Please use at::autocast::is_autocast_enabled(at::kCUDA) instead.") + return is_autocast_enabled(at::kCUDA); +} +C10_DEPRECATED_MESSAGE( + "at::autocast::set_enabled(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(at::kCUDA, enabled) instead.") +TORCH_API inline void set_enabled(bool enabled) { + TORCH_WARN_DEPRECATION( + "at::autocast::", + __func__, + "(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(at::kCUDA, enabled) instead.") + set_autocast_enabled(at::kCUDA, enabled); +} +C10_DEPRECATED_MESSAGE( + "at::autocast::get_autocast_gpu_dtype() is deprecated. Please use at::autocast::get_autocast_dtype(at::kCUDA) instead.") +TORCH_API inline at::ScalarType get_autocast_gpu_dtype() { + TORCH_WARN_DEPRECATION( + "at::autocast::", + __func__, + "() is deprecated. Please use at::autocast::get_autocast_dtype(at::kCUDA) instead.") + return get_autocast_dtype(at::kCUDA); +} +C10_DEPRECATED_MESSAGE( + "at::autocast::set_autocast_gpu_dtype(dtype) is deprecated. Please use at::autocast::set_autocast_dtype(at::kCUDA, dtype) instead.") +TORCH_API inline void set_autocast_gpu_dtype(at::ScalarType dtype) { + TORCH_WARN_DEPRECATION( + "at::autocast::", + __func__, + "(dtype) is deprecated. Please use at::autocast::set_autocast_dtype(at::kCUDA, dtype) instead.") + set_autocast_dtype(at::kCUDA, dtype); +} + +#define DECLARE_DEPRECATED_AUTOCAST_APIS(name, device_type) \ + C10_DEPRECATED_MESSAGE( \ + "at::autocast::is_" #name \ + "_enabled() is deprecated. Please use at::autocast::is_autocast_enabled(" #device_type \ + ") instead.") \ + TORCH_API inline bool is_##name##_enabled() { \ + TORCH_WARN_DEPRECATION( \ + "at::autocast::", \ + __func__, \ + "() is deprecated. Please use at::autocast::is_autocast_enabled(" #device_type \ + ") instead.") \ + return is_autocast_enabled(device_type); \ + } \ + \ + C10_DEPRECATED_MESSAGE( \ + "at::autocast::set_" #name \ + "_enabled(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(" #device_type \ + ", enabled) instead.") \ + TORCH_API inline void set_##name##_enabled(bool enabled) { \ + TORCH_WARN_DEPRECATION( \ + "at::autocast::", \ + __func__, \ + "(enabled) is deprecated. Please use at::autocast::set_autocast_enabled(" #device_type \ + ", enabled) instead.") \ + set_autocast_enabled(device_type, enabled); \ + } \ + \ + C10_DEPRECATED_MESSAGE( \ + "at::autocast::get_autocast_" #name \ + "_dtype() is deprecated. Please use at::autocast::get_autocast_dtype(" #device_type \ + ") instead.") \ + TORCH_API inline at::ScalarType get_autocast_##name##_dtype() { \ + TORCH_WARN_DEPRECATION( \ + "at::autocast::", \ + __func__, \ + "() is deprecated. Please at::autocast::get_autocast_dtype(" #device_type \ + ") instead.") \ + return get_autocast_dtype(device_type); \ + } \ + \ + C10_DEPRECATED_MESSAGE( \ + "at::autocast::set_autocast_" #name \ + "_dtype(dtype) is deprecated. Please use at::autocast::set_autocast_dtype(" #device_type \ + ", dtype) instead.") \ + TORCH_API inline void set_autocast_##name##_dtype(at::ScalarType dtype) { \ + TORCH_WARN_DEPRECATION( \ + "at::autocast::", \ + __func__, \ + "(dtype) is deprecated. 
Please use at::autocast::set_autocast_dtype(" #device_type \ + ", dtype) instead.") \ + set_autocast_dtype(device_type, dtype); \ + } + +#define AT_FORALL_DEPRECATED_AUTOCAST_BAKCNEDS(_) \ + _(cpu, at::kCPU) \ + _(xpu, at::kXPU) \ + _(xla, at::kXLA) \ + _(hpu, at::kHPU) \ + _(ipu, at::kIPU) \ + _(privateuseone, at::kPrivateUse1) + +// deprecated other backend specific autocast APIs +AT_FORALL_DEPRECATED_AUTOCAST_BAKCNEDS(DECLARE_DEPRECATED_AUTOCAST_APIS) + namespace { inline bool is_autocast_eligible( const Tensor& tensor, @@ -94,26 +174,24 @@ inline DispatchKey get_autocast_dispatch_key_from_device_type( } } +inline bool is_autocast_available(c10::DeviceType device_type) { + if (device_type == at::kCPU || device_type == at::kCUDA || + device_type == at::kXPU || device_type == at::kIPU || + device_type == at::kHPU || device_type == at::kXLA || + device_type == at::kPrivateUse1) { + return true; + } else { + return false; + } +} + inline at::ScalarType get_lower_precision_fp_from_device_type( c10::DeviceType device_type) { - switch (device_type) { - case c10::DeviceType::CUDA: - return get_autocast_gpu_dtype(); - case c10::DeviceType::CPU: - return get_autocast_cpu_dtype(); - case c10::DeviceType::XPU: - return get_autocast_xpu_dtype(); - case c10::DeviceType::IPU: - return get_autocast_ipu_dtype(); - case c10::DeviceType::HPU: - return get_autocast_hpu_dtype(); - case c10::DeviceType::XLA: - return get_autocast_xla_dtype(); - case c10::DeviceType::PrivateUse1: - return get_autocast_privateuseone_dtype(); - default: - throw std::runtime_error( - "unknown device type for autocast in get_lower_precision_fp_from_device_type"); + if (is_autocast_available(device_type)) { + return get_autocast_dtype(device_type); + } else { + throw std::runtime_error( + "unknown device type for autocast in get_lower_precision_fp_from_device_type"); } } @@ -541,9 +619,13 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions. #define ADD_NS(RAW_OP) at::RAW_OP +#define _KERNEL_OVERLOAD_NARG_IMPL(_0, _1, _2, N, ...) N +#define _KERNEL_OVERLOAD_NARG(...) \ + C10_EXPAND_MSVC_WORKAROUND(_KERNEL_OVERLOAD_NARG_IMPL(__VA_ARGS__, 2, 1)) + // Common cases where registration signature matches redispatch signature // (that's why SIGNATURE is repeated in the WrapFunction instantiation) -#define KERNEL(DISPATCHKEY, OP, POLICY) \ +#define KERNEL1(DISPATCHKEY, OP, POLICY) \ m.impl( \ TORCH_SELECTIVE_NAME("aten::" #OP), \ &::at::autocast::WrapFunction< \ @@ -563,6 +645,15 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions. decltype(ATEN_FN2(OP, OVERLOAD)), \ &ATEN_FN2(OP, OVERLOAD)>::type::call); +#define _KERNEL_DISPATCH(DISPATCHKEY, NARG, ...) \ + C10_CONCATENATE(KERNEL, NARG)(DISPATCHKEY, __VA_ARGS__) + +#define _KERNEL_IMPL(DISPATCHKEY, ...) \ + _KERNEL_DISPATCH(DISPATCHKEY, _KERNEL_OVERLOAD_NARG(__VA_ARGS__), __VA_ARGS__) + +// It will dispatch to KERNEL1 or KERNEL2 based on its inputs. +#define KERNEL(DISPATCHKEY, ...) _KERNEL_IMPL(DISPATCHKEY, __VA_ARGS__) + // Less-common but still useful case: redispatching to a function // with a new signature (e.g. appending a dtype) #define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE( \ @@ -581,12 +672,9 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions. 
REDISPATCH_SIGNATURE, \ &REDISPATCH_FUNC>::type::call); -// KERNEL_CPU/KERNEL_CPU2/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CPU -// registration for AutocastCPU -#define KERNEL_CPU(OP, POLICY) KERNEL(c10::DeviceType::CPU, OP, POLICY) - -#define KERNEL_CPU2(OP, OVERLOAD, POLICY) \ - KERNEL2(c10::DeviceType::CPU, OP, OVERLOAD, POLICY) +// KERNEL_CPU/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CPU +// registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastCPU +#define KERNEL_CPU(...) KERNEL(c10::DeviceType::CPU, __VA_ARGS__) #define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CPU( \ REDISPATCH_FUNC, \ @@ -602,12 +690,9 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions. REDISPATCH_SIGNATURE, \ POLICY) -// KERNEL_CUDA/KERNEL_CUDA2/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CUDA -// registration for AutocastCUDA -#define KERNEL_CUDA(OP, POLICY) KERNEL(c10::DeviceType::CUDA, OP, POLICY) - -#define KERNEL_CUDA2(OP, OVERLOAD, POLICY) \ - KERNEL2(c10::DeviceType::CUDA, OP, OVERLOAD, POLICY) +// KERNEL_CUDA/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CUDA +// registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastCUDA +#define KERNEL_CUDA(...) KERNEL(c10::DeviceType::CUDA, __VA_ARGS__) #define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_CUDA( \ REDISPATCH_FUNC, \ @@ -623,14 +708,28 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions. REDISPATCH_SIGNATURE, \ POLICY) -// KERNEL_PRIVATEUSEONE/KERNEL_PRIVATEUSEONE2/ -// KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE -// registration for AutocastPrivateUse1 -#define KERNEL_PRIVATEUSEONE(OP, POLICY) \ - KERNEL(c10::DeviceType::PrivateUse1, OP, POLICY) +// KERNEL_XPU/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_XPU +// registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastXPU +#define KERNEL_XPU(...) KERNEL(c10::DeviceType::XPU, __VA_ARGS__) -#define KERNEL_PRIVATEUSEONE2(OP, OVERLOAD, POLICY) \ - KERNEL2(c10::DeviceType::PrivateUse1, OP, OVERLOAD, POLICY) +#define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_XPU( \ + REDISPATCH_FUNC, \ + REGISTER_NAME, \ + REGISTER_SIGNATURE, \ + REDISPATCH_SIGNATURE, \ + POLICY) \ + KERNEL_DIFFERENT_REDISPATCH_SIGNATURE( \ + c10::DeviceType::XPU, \ + REDISPATCH_FUNC, \ + REGISTER_NAME, \ + REGISTER_SIGNATURE, \ + REDISPATCH_SIGNATURE, \ + POLICY) + +// KERNEL_PRIVATEUSEONE/KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE +// registration (OP, POLICY) or (OP, OVERLOAD, POLICY) for AutocastPrivateUse1 +#define KERNEL_PRIVATEUSEONE(...) \ + KERNEL(c10::DeviceType::PrivateUse1, __VA_ARGS__) #define KERNEL_DIFFERENT_REDISPATCH_SIGNATURE_PRIVATEUSEONE( \ REDISPATCH_FUNC, \ @@ -645,3 +744,158 @@ copy pasted in from VariableTypeEverything.cpp with appropriate substitutions. REGISTER_SIGNATURE, \ REDISPATCH_SIGNATURE, \ POLICY) + +// Op lists for different policies. +// To make sure other backends can reuse the policy op list. 
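For context, these shared lists let any autocast backend stamp out its registrations instead of enumerating ops by hand, exactly as the AutocastXPU block above does. A hypothetical sketch for a PrivateUse1 backend follows; the helper macro names and the choice of policies are illustrative, not part of this change:

TORCH_LIBRARY_IMPL(_, AutocastPrivateUse1, m) {
  m.fallback(torch::CppFunction::makeFallthrough());
}

TORCH_LIBRARY_IMPL(aten, AutocastPrivateUse1, m) {
  // KERNEL_PRIVATEUSEONE now accepts (OP, POLICY) or (OP, OVERLOAD, POLICY).
#define _KERNEL_PU1_LOW_PRECISION_FP(...) \
  KERNEL_PRIVATEUSEONE(__VA_ARGS__, lower_precision_fp)
  AT_FORALL_LOWER_PRECISION_FP(_KERNEL_PU1_LOW_PRECISION_FP)

#define _KERNEL_PU1_FP32(...) KERNEL_PRIVATEUSEONE(__VA_ARGS__, fp32)
  AT_FORALL_FP32(_KERNEL_PU1_FP32)
}

At runtime such a backend is then driven through the device-generic calls introduced above, e.g. at::autocast::set_autocast_enabled(at::kPrivateUse1, true), set_autocast_dtype(at::kPrivateUse1, at::kHalf), and get_autocast_dtype(at::kPrivateUse1).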
+#define AT_FORALL_LOWER_PRECISION_FP(_) \ + _(_convolution, deprecated) \ + _(_convolution) \ + _(conv1d) \ + _(conv2d) \ + _(conv3d) \ + _(conv_tbc) \ + _(conv_transpose1d) \ + _(conv_transpose2d, input) \ + _(conv_transpose3d, input) \ + _(convolution) \ + _(prelu) \ + _(addmm) \ + _(addmv) \ + _(addr) \ + _(matmul) \ + _(einsum) \ + _(mm) \ + _(mv) \ + _(linalg_vecdot) \ + _(linear) \ + _(addbmm) \ + _(baddbmm) \ + _(bmm) \ + _(chain_matmul) \ + _(linalg_multi_dot) \ + _(_thnn_fused_lstm_cell) \ + _(_thnn_fused_gru_cell) \ + _(lstm_cell) \ + _(gru_cell) \ + _(rnn_tanh_cell) \ + _(rnn_relu_cell) \ + _(_scaled_dot_product_flash_attention) \ + _(scaled_dot_product_attention) + +#define AT_FORALL_FP32(_) \ + _(acos) \ + _(asin) \ + _(cosh) \ + _(erfinv) \ + _(exp) \ + _(expm1) \ + _(log) \ + _(log10) \ + _(log2) \ + _(log1p) \ + _(reciprocal) \ + _(rsqrt) \ + _(sinh) \ + _(tan) \ + _(pow, Tensor_Scalar) \ + _(pow, Tensor_Tensor) \ + _(pow, Scalar) \ + _(softplus) \ + _(layer_norm) \ + _(native_layer_norm) \ + _(group_norm) \ + _(frobenius_norm, dim) \ + _(nuclear_norm) \ + _(nuclear_norm, dim) \ + _(cosine_similarity) \ + _(poisson_nll_loss) \ + _(cosine_embedding_loss) \ + _(nll_loss) \ + _(nll_loss2d) \ + _(hinge_embedding_loss) \ + _(kl_div) \ + _(l1_loss) \ + _(smooth_l1_loss) \ + _(huber_loss) \ + _(mse_loss) \ + _(margin_ranking_loss) \ + _(multilabel_margin_loss) \ + _(soft_margin_loss) \ + _(triplet_margin_loss) \ + _(multi_margin_loss) \ + _(binary_cross_entropy_with_logits) \ + _(dist) \ + _(pdist) \ + _(cdist) \ + _(renorm) \ + _(logsumexp) \ + _(upsample_nearest1d) \ + _(_upsample_nearest_exact1d) \ + _(upsample_nearest2d) \ + _(_upsample_nearest_exact2d) \ + _(upsample_nearest3d) \ + _(_upsample_nearest_exact3d) \ + _(upsample_linear1d) \ + _(upsample_bilinear2d) \ + _(_upsample_bilinear2d_aa) \ + _(upsample_trilinear3d) \ + _(upsample_bicubic2d) \ + _(_upsample_bicubic2d_aa) + +#define AT_FORALL_FP32_SET_OPT_DTYPE(_) \ + _(prod) \ + _(prod, dim_int) \ + _(prod, dim_Dimname) \ + _(softmax, int) \ + _(softmax, Dimname) \ + _(log_softmax, int) \ + _(log_softmax, Dimname) \ + _(cumprod) \ + _(cumprod, dimname) \ + _(cumsum) \ + _(cumsum, dimname) \ + _(linalg_vector_norm) \ + _(linalg_matrix_norm) \ + _(linalg_matrix_norm, str_ord) \ + _(sum) \ + _(sum, dim_IntList) \ + _(sum, dim_DimnameList) + +#define AT_FORALL_DIFFERENT_REDISPATCH_SIGNATURE(_) \ + _(ADD_NS(norm), \ + "norm.Scalar", \ + Tensor(const Tensor&, const Scalar&), \ + Tensor(const Tensor&, const c10::optional&, ScalarType), \ + fp32_append_dtype) \ + _(ADD_NS(norm), \ + "norm.ScalarOpt_dim", \ + Tensor(const Tensor&, const c10::optional&, IntArrayRef, bool), \ + Tensor( \ + const Tensor&, \ + const c10::optional&, \ + IntArrayRef, \ + bool, \ + ScalarType), \ + fp32_append_dtype) \ + _(ADD_NS(norm), \ + "norm.names_ScalarOpt_dim", \ + Tensor(const Tensor&, const c10::optional&, DimnameList, bool), \ + Tensor( \ + const Tensor&, \ + const c10::optional&, \ + DimnameList, \ + bool, \ + ScalarType), \ + fp32_append_dtype) + +#define AT_FORALL_PROMOTE(_) \ + _(addcdiv) \ + _(addcmul) \ + _(atan2) \ + _(bilinear) \ + _(cross) \ + _(dot) \ + _(grid_sampler) \ + _(index_put) \ + _(tensordot) \ + _(scatter_add) diff --git a/aten/src/ATen/ceil_div.h b/aten/src/ATen/ceil_div.h index 2c13ff8115a09..37d67b232a22c 100644 --- a/aten/src/ATen/ceil_div.h +++ b/aten/src/ATen/ceil_div.h @@ -7,7 +7,7 @@ namespace at { /** Computes ceil(a / b) */ -template ::value>> +template >> C10_ALWAYS_INLINE C10_HOST_DEVICE T ceil_div(T a, T b) { 
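  // For non-negative a and positive b this matches std::ceil(double(a) / b),
  // e.g. ceil_div(10, 4) = (10 + 4 - 1) / 4 = 13 / 4 = 3.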
return (a + b - 1) / b; } diff --git a/aten/src/ATen/code_template.h b/aten/src/ATen/code_template.h index 14ac2fa171561..393e322e6fe66 100644 --- a/aten/src/ATen/code_template.h +++ b/aten/src/ATen/code_template.h @@ -7,8 +7,7 @@ #include #include -namespace at { -namespace jit { +namespace at::jit { // A template environment is a mapping from template variable names, e.g., // identifier (corresponding to $identifier) to their expansions. @@ -241,5 +240,4 @@ static inline std::string format(const std::string& fmt, TemplateEnv& env) { return CodeTemplate(fmt).format(env); } -} // namespace jit -} // namespace at +} // namespace at::jit diff --git a/aten/src/ATen/core/ATen_pch.h b/aten/src/ATen/core/ATen_pch.h index 1f36d0ab9f87b..57ca22bf4377a 100644 --- a/aten/src/ATen/core/ATen_pch.h +++ b/aten/src/ATen/core/ATen_pch.h @@ -110,6 +110,8 @@ #include #include #include +#include +#include #include #include #include diff --git a/aten/src/ATen/core/Array.h b/aten/src/ATen/core/Array.h index 300ae51cef6b9..8372fe81c5c5a 100644 --- a/aten/src/ATen/core/Array.h +++ b/aten/src/ATen/core/Array.h @@ -6,10 +6,11 @@ #include #include -namespace at { namespace detail { +namespace at::detail { template struct Array { + // NOLINTNEXTLINE(*c-array*) T data[size_]; C10_HOST_DEVICE T operator[](int i) const { @@ -27,7 +28,9 @@ struct Array { Array(const Array&) = default; Array& operator=(const Array&) = default; #endif - static constexpr int size(){return size_;} + static constexpr int size() { + return size_; + } // Fill the array with x. C10_HOST_DEVICE Array(T x) { for (int i = 0; i < size_; i++) { @@ -36,4 +39,4 @@ struct Array { } }; -}} +} // namespace at::detail diff --git a/aten/src/ATen/core/CachingHostAllocator.h b/aten/src/ATen/core/CachingHostAllocator.h new file mode 100644 index 0000000000000..d04cb1c6b8a70 --- /dev/null +++ b/aten/src/ATen/core/CachingHostAllocator.h @@ -0,0 +1,380 @@ +#include +#include +#include +#include + +#include +#include +#include + +namespace at { + +/** + * HostBlock is typically a fundamental memory block used in pinned memory. It + * is likely related to Event and Stream of device runtime. It is probably a + * base struct or interface that can be inherited and extended by each backend. + */ +template +struct HostBlock { + // constructor for search key + HostBlock(size_t size) : size_(size) {} + + HostBlock(size_t size, void* ptr) : size_(size), ptr_(ptr) {} + + std::mutex mutex_; + size_t size_{0}; // block size in bytes + void* ptr_{nullptr}; // memory address + bool allocated_{false}; // in-use flag + size_t event_count_{0}; // number of related events + ska::flat_hash_set streams_; // streams on which the block was used +}; + +/** + * ComparatorSize is used for lookup support in the set of host memory blocks + * using the block size. + */ +template +struct ComparatorSize { + bool operator()(const B* a, const B* b) const { + if (a->size_ != b->size_) { + return a->size_ < b->size_; + } + return (uintptr_t)a->ptr_ < (uintptr_t)b->ptr_; + } +}; + +/** + * Note [HostAllocator design] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * We have three key data structures - the free list which stores blocks that + * are not currently used, the block list which stores all blocks that have been + * allocated, and the event queue which stores runtime events and their + * corresponding blocks. + * + * Each of these are protected by a separate mutex. 
The key design principles + * are to 1) only hold each mutex for the minimal amount of time possible, 2) + * never do any possible expensive operations (such as CUDA runtime API calls) + * while holding the lock. + * + * There are four public methods: allocate, free, record_event and empty_cache. + * 1) In the allocate path, we first check to see if we can service our + * request from this free list, and otherwise we create a new block with + * allocate_host_memory. + * 2) In the free path, we insert events (if required) into the event queue, + * and if possible insert our block back into the free list. In allocate, we + * first eagerly query events until we find one that is not ready, and insert + * the corresponding block onto the free list if all the events recorded for a + * block are ready. + * 3) In the record_event path, we simply insert the given stream into the set + * of streams tracked by the specified block. This set of streams is then + * consumed in the free path. + * 4) In the empty_cache path, we flush any available blocks into the free + * list. Remove all element of free list, then remove them from block list and + * release the associated pinned memory allocation via free_block. + * + * We generalize the caching host allocator into two parts: interface and + * implementation. For any new backend looking to integrate with host allocator + * and reuse caching mechanism, these two parts are necessary to be specialized. + * + * For the implementation, we provide a CachingHostAllocatorImpl struct + * to abstract the caching mechanism. Any backend needs to provide a customized + * implementation by specializing its own public functions and the related + * runtime functions. Its template parameter S represents runtime Stream, E + * denotes runtime Event, B indicates the fundamental memory block, and C + * signifies the sorting compartor algorithm for the memory blocks. + * + * For the interface, we provide a CachingHostAllocatorInterface struct as an + * interface. Any backend needs to derive its own host allocator from this + * interface. Its template parameter T refers to an implementation that + * inherited from CachingHostAllocatorImpl. + * + * So this design can share the caching mechanism across each backend, and + * provide flexibility to each backend. A backend can choose to follow this + * implementation or reuse them by extending and overriding them as necessary. + * Taking CUDA as an example, it specializes runtime related functions to reuse + * the caching mechanism. Additionally, it extends the allocator's functionality + * by adding the allocWithCudaHostRegister function to support page-locking the + * memory range used by CUDA. Of course, you can also refer to + * XPUCachingHostAllocator, which is a host caching allocator supported on XPU + * backend, to implement a basic host caching allocator. + * + * Some of the invariants here are less strict than they could be - for example, + * we do not enforce that free(Block* block) => block->event_count == 0. This is + * for compatibility reasons, and we can explore enforcing these in subsequent + * versions. + * + * Note that this caching host allocator does not split larger allocations into + * smaller blocks, unlike the caching device allocator. + */ + +template < + typename S, + typename E, + typename B = HostBlock, + typename C = ComparatorSize> +struct CachingHostAllocatorImpl { + virtual ~CachingHostAllocatorImpl() = default; + + public: + // return data_ptr and block pair. 
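  // Allocation flow: drain ready events via process_events(), then try to
  // reuse a block from the free list; on a miss, round the request up to the
  // next power of two, allocate fresh pinned memory via allocate_host_memory(),
  // and register the new block in the block list.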
+ virtual std::pair allocate(size_t size) { + if (size == 0) { + return {nullptr, nullptr}; + } + + process_events(); + + // First, try to allocate from the free list + auto* block = get_free_block(size); + if (block) { + return {block->ptr_, reinterpret_cast(block)}; + } + + // Round up the allocation to the nearest power of two to improve reuse. + size_t roundSize = c10::llvm::PowerOf2Ceil(size); + void* ptr = nullptr; + allocate_host_memory(roundSize, &ptr); + + // Then, create a new block. + block = new B(roundSize, ptr); + block->allocated_ = true; + + add_allocated_block(block); + return {block->ptr_, reinterpret_cast(block)}; + } + + virtual void free(void* ctx) { + if (!ctx) { + return; + } + + // Note: we can assume that free is correctly paired with alloc, and thus we + // do not need to look up the ctx in blocks_. + auto* block = reinterpret_cast(ctx); + + c10::optional> events; + { + std::lock_guard g(block->mutex_); + block->allocated_ = false; + if (block->streams_.empty()) { + TORCH_INTERNAL_ASSERT(block->event_count_ == 0); + } else { + events = std::vector(); + events->reserve(block->streams_.size()); + for (auto stream : block->streams_) { + record_stream(events, stream); + } + block->event_count_ += events->size(); + block->streams_.clear(); + } + } + + if (!events) { + std::lock_guard g(free_list_mutex_); + free_list_.insert(block); + } else { + // restore these events that record by used streams. + std::lock_guard g(events_mutex_); + for (auto&& event : *events) { + events_.emplace_front(std::move(event), block); + } + } + } + + virtual bool record_event(void* ptr, void* ctx, S stream) { + auto* block = reinterpret_cast(ctx); + + // Note: we need to check if the passed-in `ctx` is valid. This is because + // `record_event` (via `CachingHostAllocator_recordEvent`) can be invoked on + // an arbitrary tensor, and is not guaranteed to correspond to a pinned + // memory allocation. Therefore, we need to check that `ctx` is valid before + // proceeding. + { + std::lock_guard g(blocks_mutex_); + if (blocks_.find(block) != blocks_.end()) { + // Now we know this object is safe to access. + std::lock_guard gb(block->mutex_); + TORCH_INTERNAL_ASSERT(block->allocated_); + block->streams_.insert(stream); + return true; + } + auto it = ptr_to_block_.find(ptr); + if (it != ptr_to_block_.end()) { + block = it->second; + std::lock_guard g(block->mutex_); + TORCH_INTERNAL_ASSERT(block->allocated_); + block->streams_.insert(stream); + return true; + } + } + + return false; + } + + virtual void empty_cache() { + // Flush any available blocks into the free_list. + process_events(); + + // Remove all elements from the free list, remove them from the blocks + // list, and free the associated pinned memory allocation. This requires + // concurrently holding both the free list mutex and the blocks mutex, and + // is the only function that concurrently holds multiple mutexes. 
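    // std::lock acquires both mutexes together with deadlock avoidance; the
    // adopt_lock guards below only take ownership so both are released on
    // scope exit.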
+ std::lock(free_list_mutex_, blocks_mutex_); + std::lock_guard gf(free_list_mutex_, std::adopt_lock); + std::lock_guard gb(blocks_mutex_, std::adopt_lock); + + std::vector blocks_to_remove(free_list_.begin(), free_list_.end()); + free_list_.clear(); + for (auto* block : blocks_to_remove) { + blocks_.erase(block); + ptr_to_block_.erase(block->ptr_); + free_block(block); + delete block; + } + } + + virtual void copy_data(void* dest, const void* src, std::size_t count) const { + TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for copy_data"); + } + + private: + virtual void add_allocated_block(B* block) { + std::lock_guard g(blocks_mutex_); + blocks_.insert(block); + ptr_to_block_.insert({block->ptr_, block}); + } + + virtual B* get_free_block(size_t size) { + std::lock_guard g(free_list_mutex_); + B key(size); + auto it = free_list_.lower_bound(&key); + if (it != free_list_.end()) { + B* block = *it; + block->allocated_ = true; + free_list_.erase(it); + return block; + } + return nullptr; + } + + virtual void process_events() { + + while (true) { + // Avoid calling cudaEventDestroy while holding a mutex, so move + // intermediate events out of the lock into this object. + // process the last event + c10::optional> processed; + { + std::lock_guard g(events_mutex_); + if (!events_.empty()) { + processed = std::move(events_.back()); + events_.pop_back(); + } + } + + if (!processed) { + return; + } + + // otherwise, query the event + { + // now, see if we can handle this element + auto& event = processed->first; + if (!query_event(event)) { + // push the event onto the back if it's not ready. + { + std::lock_guard g(events_mutex_); + events_.push_back(std::move(*processed)); + } + return; + } + } + + // Process the events. + TORCH_INTERNAL_ASSERT(processed); + auto* block = processed->second; + bool available = false; + { + std::lock_guard g(block->mutex_); + TORCH_INTERNAL_ASSERT(!block->allocated_) + block->event_count_--; + if (block->event_count_ == 0) { + available = true; + } + } + + if (available) { + std::lock_guard g(free_list_mutex_); + free_list_.insert(block); + } + } + } + + /* These following functions are runtime-related. */ + + // Allocate page-locked memory on the host. + virtual void allocate_host_memory(size_t size, void** ptr) { + TORCH_CHECK_NOT_IMPLEMENTED( + false, "Not implemented for allocate_host_memory"); + } + + // Free block and release the pointer contained in block. + virtual void free_block(B* block) { + TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for free_block"); + } + + // Record an event on stream and store event into events. + virtual void record_stream(c10::optional>& events, S stream) { + TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for record_stream"); + } + + // Query event if it is completed. + virtual bool query_event(E& event) { + TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for query_event"); + } + + alignas(64) std::mutex blocks_mutex_; + ska::flat_hash_set blocks_; // block list + ska::flat_hash_map ptr_to_block_; + + // Note: sharding this mutex seems to be profitable in heavily multi-threaded + // scenarios. + alignas(64) std::mutex free_list_mutex_; + // Note: an alternative datastructure can yield significant wins here in + // microbenchmarks. 
+ std::set free_list_; // free list + + alignas(64) std::mutex events_mutex_; + std::deque> events_; // event queue paired with block +}; + +template +struct CachingHostAllocatorInterface : public at::Allocator { + CachingHostAllocatorInterface() :impl_(std::make_unique()) {} + + at::DataPtr allocate(size_t size) override { + TORCH_CHECK_NOT_IMPLEMENTED(false, "Not implemented for allocate"); + } + + void free(void* ctx) { + impl_->free(ctx); + } + + template + bool record_event(void* ptr, void* ctx, S stream) { + return impl_->record_event(ptr, ctx, stream); + } + + void empty_cache() { + impl_->empty_cache(); + } + + void copy_data(void* dest, const void* src, std::size_t count) + const override { + impl_->copy_data(dest, src, count); + } + + std::unique_ptr impl_; +}; + +} // namespace at diff --git a/aten/src/ATen/core/CheckMemoryFormat.h b/aten/src/ATen/core/CheckMemoryFormat.h index 3d1712a2ff19b..442889e2eec6f 100644 --- a/aten/src/ATen/core/CheckMemoryFormat.h +++ b/aten/src/ATen/core/CheckMemoryFormat.h @@ -1,6 +1,6 @@ #include -namespace c10 { namespace impl { +namespace c10::impl { inline c10::optional check_tensor_options_and_extract_memory_format( @@ -22,4 +22,4 @@ check_tensor_options_and_extract_memory_format( } } -}} // namespace impl namespace c10 +} // namespace impl namespace c10 diff --git a/aten/src/ATen/core/DeprecatedTypeProperties.h b/aten/src/ATen/core/DeprecatedTypeProperties.h index b77b09d595d3c..222465eac56f2 100644 --- a/aten/src/ATen/core/DeprecatedTypeProperties.h +++ b/aten/src/ATen/core/DeprecatedTypeProperties.h @@ -94,6 +94,10 @@ class TORCH_API DeprecatedTypeProperties { return toBackend(Backend::HIP); } + DeprecatedTypeProperties & privateUser1() const { + return toBackend(Backend::PrivateUse1); + } + /// Constructs the `TensorOptions` from a type and a `device_index`. TensorOptions options(int16_t device_index = -1) const { return TensorOptions().dtype(typeMeta()) diff --git a/aten/src/ATen/core/DeprecatedTypePropertiesRegistry.h b/aten/src/ATen/core/DeprecatedTypePropertiesRegistry.h index a21f1abbe97f4..78f0cfdfa5530 100644 --- a/aten/src/ATen/core/DeprecatedTypePropertiesRegistry.h +++ b/aten/src/ATen/core/DeprecatedTypePropertiesRegistry.h @@ -5,6 +5,7 @@ #include #include +#include namespace at { @@ -21,6 +22,7 @@ class TORCH_API DeprecatedTypePropertiesRegistry { DeprecatedTypeProperties& getDeprecatedTypeProperties(Backend p, ScalarType s) const; private: + // NOLINTNEXTLINE(*c-array*) std::unique_ptr registry [static_cast(Backend::NumOptions)] [static_cast(ScalarType::NumOptions)]; diff --git a/aten/src/ATen/core/Dict.cpp b/aten/src/ATen/core/Dict.cpp index 3721ad25c9b92..fb49f75d63cd7 100644 --- a/aten/src/ATen/core/Dict.cpp +++ b/aten/src/ATen/core/Dict.cpp @@ -1,7 +1,7 @@ #include -namespace c10 { -namespace detail { + +namespace c10::detail { bool operator==(const DictImpl& lhs, const DictImpl& rhs) { bool isEqualFastChecks = *lhs.elementTypes.keyType == *rhs.elementTypes.keyType && @@ -25,5 +25,4 @@ bool operator==(const DictImpl& lhs, const DictImpl& rhs) { return true; } -} // namespace detail -} // namespace c10 +} // namespace c10::detail diff --git a/aten/src/ATen/core/Dict.h b/aten/src/ATen/core/Dict.h index c4fb44ce0c636..964b4a152b5ae 100644 --- a/aten/src/ATen/core/Dict.h +++ b/aten/src/ATen/core/Dict.h @@ -207,7 +207,7 @@ template Dict toGenericDict(Dict class Dict final { private: - static_assert((std::is_same::value && std::is_same::value) || guts::typelist::contains::value, "Invalid Key type for Dict. 
We only support int64_t, double, bool, and string."); + static_assert((std::is_same_v && std::is_same_v) || guts::typelist::contains::value, "Invalid Key type for Dict. We only support int64_t, double, bool, and string."); // impl_ stores the underlying map as a ska_ordered::order_preserving_flat_hash_map. // We intentionally don't offer conversion from/to diff --git a/aten/src/ATen/core/Dict_inl.h b/aten/src/ATen/core/Dict_inl.h index 69f6791d91cac..0419b3bd49e91 100644 --- a/aten/src/ATen/core/Dict_inl.h +++ b/aten/src/ATen/core/Dict_inl.h @@ -120,9 +120,9 @@ template std::pair::iterator, bool> Dict::insert(Key_&& key, Value_&& value) const { static_assert(std::is_constructible::value, "Wrong type for the key argument of Dict::insert"); static_assert(std::is_constructible::value, "Wrong type for the value argument of Dict::insert"); - auto inserted = impl_->dict.insert(std::pair{ - Key(std::forward(key)), - Value(std::forward(value))}); + auto inserted = impl_->dict.emplace( + Key(std::forward(key)), + Value(std::forward(value))); return {iterator{inserted.first}, inserted.second}; } diff --git a/aten/src/ATen/core/Dimname.cpp b/aten/src/ATen/core/Dimname.cpp index b39b7f00b5c66..47526b6511edd 100644 --- a/aten/src/ATen/core/Dimname.cpp +++ b/aten/src/ATen/core/Dimname.cpp @@ -20,7 +20,7 @@ bool Dimname::isValidName(const std::string& name) { // letters A through Z, the underscore _ and, except for the first // character, the digits 0 through 9" (at least length 1) // https://docs.python.org/3/reference/lexical_analysis.html#identifiers - if (name.length() == 0) { + if (name.empty()) { return false; } for (auto it = name.begin(); it != name.end(); ++it) { diff --git a/aten/src/ATen/core/Formatting.cpp b/aten/src/ATen/core/Formatting.cpp index 957b89c7a1f16..824640705238a 100644 --- a/aten/src/ATen/core/Formatting.cpp +++ b/aten/src/ATen/core/Formatting.cpp @@ -72,7 +72,7 @@ static std::tuple __printFormat(std::ostream& stream, const Tensor& return std::make_tuple(1., 0); } bool intMode = true; - auto self_p = self.data_ptr(); + auto self_p = self.const_data_ptr(); for (const auto i : c10::irange(size)) { auto z = self_p[i]; if(std::isfinite(z)) { @@ -160,7 +160,7 @@ static void __printIndent(std::ostream &stream, int64_t indent) static void printScale(std::ostream & stream, double scale) { FormatGuard guard(stream); - stream << defaultfloat << scale << " *" << std::endl; + stream << defaultfloat << scale << " *" << '\n'; } static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t linesize, int64_t indent) { @@ -178,7 +178,7 @@ static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t line } if(nColumnPerLine < self.size(1)) { if(firstColumn != 0) { - stream << std::endl; + stream << '\n'; } stream << "Columns " << firstColumn+1 << " to " << lastColumn+1; __printIndent(stream, indent); @@ -189,11 +189,11 @@ static void __printMatrix(std::ostream& stream, const Tensor& self, int64_t line } for (const auto l : c10::irange(self.size(0))) { Tensor row = self.select(0,l); - double *row_ptr = row.data_ptr(); + const double *row_ptr = row.const_data_ptr(); for (const auto c : c10::irange(firstColumn, lastColumn+1)) { stream << std::setw(sz) << row_ptr[c]/scale; if(c == lastColumn) { - stream << std::endl; + stream << '\n'; if(l != self.size(0)-1) { if(scale != 1) { __printIndent(stream, indent); @@ -239,7 +239,7 @@ static void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize) if(start) { start = false; } else { - stream << std::endl; + stream 
<< '\n'; } stream << "("; Tensor tensor = self; @@ -247,7 +247,7 @@ static void __printTensor(std::ostream& stream, Tensor& self, int64_t linesize) tensor = tensor.select(0, counter[i]); stream << counter[i]+1 << ","; } - stream << ".,.) = " << std::endl; + stream << ".,.) = " << '\n'; __printMatrix(stream, tensor, linesize, 1); } } @@ -279,7 +279,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi tensor = tensor_.to(kCPU, kDouble).contiguous(); } if(tensor.ndimension() == 0) { - stream << defaultfloat << tensor.data_ptr()[0] << std::endl; + stream << defaultfloat << tensor.const_data_ptr()[0] << '\n'; stream << "[ " << tensor_.toString() << "{}"; } else if(tensor.ndimension() == 1) { if (tensor.numel() > 0) { @@ -287,9 +287,9 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi if(scale != 1) { printScale(stream, scale); } - double* tensor_p = tensor.data_ptr(); + const double* tensor_p = tensor.const_data_ptr(); for (const auto i : c10::irange(tensor.size(0))) { - stream << std::setw(sz) << tensor_p[i]/scale << std::endl; + stream << std::setw(sz) << tensor_p[i]/scale << '\n'; } } stream << "[ " << tensor_.toString() << "{" << tensor.size(0) << "}"; @@ -329,7 +329,7 @@ std::ostream& print(std::ostream& stream, const Tensor & tensor_, int64_t linesi if (tensor.getIntrusivePtr()->autograd_meta()) { auto& fw_grad = tensor._fw_grad(/* level */ 0); if (fw_grad.defined()) { - stream << ", tangent:" << std::endl << fw_grad; + stream << ", tangent:" << '\n' << fw_grad; } } stream << " ]"; diff --git a/aten/src/ATen/core/Generator.cpp b/aten/src/ATen/core/Generator.cpp index 800f8c7c88ec6..0334161f54e73 100644 --- a/aten/src/ATen/core/Generator.cpp +++ b/aten/src/ATen/core/Generator.cpp @@ -13,4 +13,12 @@ at::Tensor Generator::get_state() const { return at::Tensor::wrap_tensor_impl(this->impl_->get_state()); } +void Generator::graphsafe_set_state(const Generator& new_state) { + this->impl_->graphsafe_set_state(new_state.getIntrusivePtr()); +} + +Generator Generator::graphsafe_get_state() const { + return Generator(this->impl_->graphsafe_get_state()); +} + } // namespace at diff --git a/aten/src/ATen/core/Generator.h b/aten/src/ATen/core/Generator.h index 36f2eac9667fc..b237c571b22d3 100644 --- a/aten/src/ATen/core/Generator.h +++ b/aten/src/ATen/core/Generator.h @@ -1,15 +1,11 @@ #pragma once -#include +#include #include -#include -#include +#include #include -#include -#include #include -#include #include #include #include @@ -111,6 +107,10 @@ struct TORCH_API Generator { at::Tensor get_state() const; + void graphsafe_set_state(const Generator& new_state); + + Generator graphsafe_get_state() const; + std::mutex& mutex() { return impl_->mutex_; } diff --git a/aten/src/ATen/core/IListRef.h b/aten/src/ATen/core/IListRef.h index 2bbf43975eacc..01e52f52f684c 100644 --- a/aten/src/ATen/core/IListRef.h +++ b/aten/src/ATen/core/IListRef.h @@ -307,10 +307,10 @@ class IListRefTagImplBase {}; * reference type, then it's left unchanged. */ template -using _MaterializedIListRefElem = typename std::conditional< - std::is_reference::value, - typename std::reference_wrapper::type>, - T>::type; +using _MaterializedIListRefElem = std::conditional_t< + std::is_reference_v, + typename std::reference_wrapper>, + T>; template using MaterializedIListRefElem = _MaterializedIListRefElem>; @@ -540,7 +540,7 @@ class IListRef { template < typename... 
UnboxedConstructorArgs, typename = std::enable_if_t< - std::is_constructible::value>> + std::is_constructible_v>> IListRef(UnboxedConstructorArgs&&... args) : tag_(IListRefTag::Unboxed) { payload_.unboxed = unboxed_type(std::forward(args)...); } diff --git a/aten/src/ATen/core/IListRef_inl.h b/aten/src/ATen/core/IListRef_inl.h index 534272f69b64f..34673d6bf2b24 100644 --- a/aten/src/ATen/core/IListRef_inl.h +++ b/aten/src/ATen/core/IListRef_inl.h @@ -8,8 +8,8 @@ class Tensor; class OptionalTensorRef; } -namespace c10 { -namespace detail { + +namespace c10::detail { /* * Specializations of `IListRefTagImplBase` that implement the default @@ -184,8 +184,8 @@ class IListRefTagImpl at::OptionalTensorRef, MaterializedIListRefElem> {}; -} // namespace detail -} // namespace c10 +} // namespace c10::detail + namespace at { diff --git a/aten/src/ATen/core/IListRef_test.cpp b/aten/src/ATen/core/IListRef_test.cpp index 0530dea5f28b5..3fcb3858e657f 100644 --- a/aten/src/ATen/core/IListRef_test.cpp +++ b/aten/src/ATen/core/IListRef_test.cpp @@ -103,7 +103,7 @@ TEST(ITensorListRefTest, Boxed_GetConstRefTensor) { const List boxed(vec); at::ITensorListRef list(boxed); static_assert( - std::is_same::value, + std::is_same_v, "Accessing elements from List through a ITensorListRef should be const references."); EXPECT_TRUE(boxed[0].is_same(*list.begin())); EXPECT_TRUE(boxed[1].is_same(*(++list.begin()))); @@ -113,7 +113,7 @@ TEST(ITensorListRefTest, Unboxed_GetConstRefTensor) { auto vec = get_tensor_vector(); at::ITensorListRef list(vec); static_assert( - std::is_same::value, + std::is_same_v, "Accessing elements from ArrayRef through a ITensorListRef should be const references."); EXPECT_TRUE(vec[0].is_same(*list.begin())); EXPECT_TRUE(vec[1].is_same(*(++list.begin()))); diff --git a/aten/src/ATen/core/List.cpp b/aten/src/ATen/core/List.cpp index 6fb9b11ef156f..a9f041517062e 100644 --- a/aten/src/ATen/core/List.cpp +++ b/aten/src/ATen/core/List.cpp @@ -1,7 +1,7 @@ #include -namespace c10 { -namespace detail { + +namespace c10::detail { bool operator==(const ListImpl& lhs, const ListImpl& rhs) { return *lhs.elementType == *rhs.elementType && lhs.list.size() == rhs.list.size() && @@ -16,5 +16,4 @@ bool operator==(const ListImpl& lhs, const ListImpl& rhs) { ListImpl::ListImpl(list_type list_, TypePtr elementType_) : list(std::move(list_)) , elementType(std::move(elementType_)) {} -} // namespace detail -} // namespace c10 +} // namespace c10::detail diff --git a/aten/src/ATen/core/List.h b/aten/src/ATen/core/List.h index d1271dadec2ac..68ecf5ed343f8 100644 --- a/aten/src/ATen/core/List.h +++ b/aten/src/ATen/core/List.h @@ -44,7 +44,7 @@ template class ListIterator; template class ListElementReference; template -void swap(ListElementReference&& lhs, ListElementReference&& rhs); +void swap(ListElementReference&& lhs, ListElementReference&& rhs) noexcept; template bool operator==(const ListElementReference& lhs, const T& rhs); @@ -68,8 +68,8 @@ template class ListElementReference final { public: operator std::conditional_t< - std::is_reference::type>::value, + std::is_reference_v::type>, const T&, T>() const; @@ -84,7 +84,7 @@ class ListElementReference final { return *iterator_; } - friend void swap(ListElementReference&& lhs, ListElementReference&& rhs); + friend void swap(ListElementReference&& lhs, ListElementReference&& rhs) noexcept; ListElementReference(const ListElementReference&) = delete; ListElementReference& operator=(const ListElementReference&) = delete; @@ -285,7 +285,7 @@ class List final { * 
Returns the element at specified location pos, with bounds checking. * If pos is not within the range of the container, an exception of type std::out_of_range is thrown. */ - value_type get(size_type pos) const; + internal_const_reference_type get(size_type pos) const; /** * Moves out the element at the specified location pos and returns it, with bounds checking. diff --git a/aten/src/ATen/core/List_inl.h b/aten/src/ATen/core/List_inl.h index 0fb911278a919..f8ce73eb3f9cc 100644 --- a/aten/src/ATen/core/List_inl.h +++ b/aten/src/ATen/core/List_inl.h @@ -120,8 +120,8 @@ namespace impl { template ListElementReference::operator std::conditional_t< - std::is_reference::type>::value, + std::is_reference_v::type>, const T&, T>() const { return iterator_->template to(); @@ -146,7 +146,7 @@ ListElementReference& ListElementReference::operator=( } template -void swap(ListElementReference&& lhs, ListElementReference&& rhs) { +void swap(ListElementReference&& lhs, ListElementReference&& rhs) noexcept { std::swap(*lhs.iterator_, *rhs.iterator_); } @@ -186,8 +186,8 @@ void List::set(size_type pos, value_type&& value) const { } template -typename List::value_type List::get(size_type pos) const { - return c10::detail::list_element_to(impl_->list.at(pos)); +typename List::internal_const_reference_type List::get(size_type pos) const { + return operator[](pos); } template diff --git a/aten/src/ATen/core/List_test.cpp b/aten/src/ATen/core/List_test.cpp index 1fe14309a9330..56da3cf299e90 100644 --- a/aten/src/ATen/core/List_test.cpp +++ b/aten/src/ATen/core/List_test.cpp @@ -1118,7 +1118,7 @@ TEST(ListTestNonIValueBasedList, sameValueDifferentStorage_thenIsReturnsFalse) { TEST(ListTest, canAccessStringByReference) { List list({"one", "two"}); const auto& listRef = list; - static_assert(std::is_same::value, + static_assert(std::is_same_v, "const List access should be by const reference"); std::string str = list[1]; const std::string& strRef = listRef[1]; @@ -1130,7 +1130,7 @@ TEST(ListTest, canAccessOptionalStringByReference) { List> list({"one", "two", c10::nullopt}); const auto& listRef = list; static_assert( - std::is_same>>::value, + std::is_same_v>>, "List> access should be by const reference"); c10::optional str1 = list[1]; c10::optional str2 = list[2]; @@ -1148,7 +1148,7 @@ TEST(ListTest, canAccessTensorByReference) { List list; const auto& listRef = list; static_assert( - std::is_same::value, + std::is_same_v, "List access should be by const reference"); } diff --git a/aten/src/ATen/core/MetaFallbackKernel.cpp b/aten/src/ATen/core/MetaFallbackKernel.cpp index fe56568bbbcd1..8523a55878103 100644 --- a/aten/src/ATen/core/MetaFallbackKernel.cpp +++ b/aten/src/ATen/core/MetaFallbackKernel.cpp @@ -8,14 +8,14 @@ static void metaFallback( const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) { - c10::Dispatcher::singleton().throwIfHasAbstractImplPyStub(op.operator_name()); + c10::Dispatcher::singleton().throwIfHasPythonModule(op.operator_name()); TORCH_CHECK_NOT_IMPLEMENTED( false, op.operator_name(), ": attempted to run this operator with Meta tensors, but there was no ", - "abstract impl or Meta kernel registered. You may have run into this message " + "fake impl or Meta kernel registered. You may have run into this message " "while using an operator with PT2 compilation APIs (torch.compile/torch.export); " - "in order to use this operator with those APIs you'll need to add an abstract impl." 
+ "in order to use this operator with those APIs you'll need to add a fake impl." "Please see the following doc for next steps: " "https://docs.google.com/document/d/1_W62p8WJOQQUzPsJYa7s701JXt0qf2OfLub2sbkHOaU/edit"); } diff --git a/aten/src/ATen/core/NamedTensor.cpp b/aten/src/ATen/core/NamedTensor.cpp index 846178f5a00d1..b224a7c9608cc 100644 --- a/aten/src/ATen/core/NamedTensor.cpp +++ b/aten/src/ATen/core/NamedTensor.cpp @@ -87,8 +87,8 @@ void check_names_valid_for(TensorImpl* impl, DimnameList names) { void internal_set_names_inplace(TensorImpl* impl, optional names, bool validate_names) { TORCH_CHECK(impl->layout() == Layout::Strided, "NYI: named tensors only support strided layout"); - TORCH_CHECK(impl->device().is_cpu() || impl->device().is_cuda() || impl->device().is_privateuseone(), - "NYI: named tensors only support CPU, CUDA or ", c10::get_privateuse1_backend(), " tensors."); + TORCH_CHECK(impl->device().is_cpu() || impl->device().is_cuda() || impl->device().is_xpu() || impl->device().is_privateuseone(), + "NYI: named tensors only support CPU, CUDA, XPU or ", c10::get_privateuse1_backend(), " tensors."); if (!names) { impl->set_named_tensor_meta(nullptr); return; @@ -121,9 +121,9 @@ void internal_set_names_inplace(TensorImpl* impl, std::vector&& names, } auto* meta = get_named_tensor_meta(impl); if (meta == nullptr) { - impl->set_named_tensor_meta(std::make_unique(NamedTensorMeta::HasNonWildcard, names)); + impl->set_named_tensor_meta(std::make_unique(NamedTensorMeta::HasNonWildcard, std::move(names))); } else { - meta->set_names(NamedTensorMeta::HasNonWildcard, names); + meta->set_names(NamedTensorMeta::HasNonWildcard, std::move(names)); } } diff --git a/aten/src/ATen/core/NamedTensor.h b/aten/src/ATen/core/NamedTensor.h index 73a0d7d02551b..d6ff30ce00838 100644 --- a/aten/src/ATen/core/NamedTensor.h +++ b/aten/src/ATen/core/NamedTensor.h @@ -2,7 +2,6 @@ #include #include -#include namespace at { @@ -45,7 +44,7 @@ struct TORCH_API NamedTensorMeta final : public c10::NamedTensorMetaInterface { // Used for an assertion in TensorImpl.h int64_t slow_dim() const override { - return names_.size(); + return static_cast(names_.size()); } void check_invariants() const { @@ -80,7 +79,7 @@ struct TORCH_API NamesMode { // A RAII, thread local (!) guard that enables or disables names upon // construction, and sets it back to the original value upon destruction. 
struct TORCH_API NoNamesGuard { - NoNamesGuard() : prev_mode(NamesMode::is_enabled()), initialized(true) { + NoNamesGuard() : prev_mode(NamesMode::is_enabled()) { NamesMode::set_enabled(false); } ~NoNamesGuard() { @@ -94,7 +93,7 @@ struct TORCH_API NoNamesGuard { } private: bool prev_mode; - bool initialized; + bool initialized{true}; }; void check_names_valid_for(const TensorBase& tensor, DimnameList names); diff --git a/aten/src/ATen/core/NestedIntSymNodeImpl.cpp b/aten/src/ATen/core/NestedIntSymNodeImpl.cpp new file mode 100644 index 0000000000000..b703f76773b46 --- /dev/null +++ b/aten/src/ATen/core/NestedIntSymNodeImpl.cpp @@ -0,0 +1,80 @@ +#include +#include +#include + +namespace c10 { + +namespace { +bool _eq(const char* op, c10::SymNodeImpl* lhs, c10::SymNodeImpl* rhs) { + TORCH_INTERNAL_ASSERT(lhs->is_nested_int()); + c10::optional c = rhs->nested_int(); + return ( + c.has_value() && lhs->nested_int() == *c && + lhs->nested_int_coeff() == rhs->nested_int_coeff()); +} +bool _ge(const char* op, c10::SymNodeImpl* lhs, c10::SymNodeImpl* rhs) { + if (auto mb_si = lhs->nested_int()) { + if (auto mb_si2 = rhs->nested_int()) { + if (*mb_si == *mb_si2) { + return lhs->nested_int_coeff() >= rhs->nested_int_coeff(); + } + TORCH_CHECK(false, "nested int ", op, ": Relation is indeterminate"); + } + // NOLINTNEXTLINE(bugprone-unchecked-optional-access) + if (rhs->constant_int() && *rhs->constant_int() <= 2) { + return true; + } + TORCH_CHECK(false, "nested int ", op, ": Relation is indeterminate"); + } else if (rhs->nested_int()) { + // NOLINTNEXTLINE(bugprone-unchecked-optional-access) + if (lhs->constant_int() && *lhs->constant_int() < 2) { + return false; + } + TORCH_CHECK(false, "nested int ", op, ": Relation is indeterminate"); + } + TORCH_INTERNAL_ASSERT(false, "expect at least one nested int"); +} +} // namespace + +c10::SymNode NestedIntSymNodeImpl::eq(const c10::SymNode& other) { + return SymNode(c10::make_intrusive>( + _eq("eq", this, other.get()))); +} + +c10::SymNode NestedIntSymNodeImpl::ne(const c10::SymNode& other) { + return SymNode(c10::make_intrusive>( + !_eq("ne", this, other.get()))); +} + +c10::SymNode NestedIntSymNodeImpl::ge(const c10::SymNode& other) { + return SymNode(c10::make_intrusive>( + _ge("ge", this, other.get()))); +} + +c10::SymNode NestedIntSymNodeImpl::gt(const c10::SymNode& other) { + return SymNode(c10::make_intrusive>( + !_ge("gt", other.get(), this))); +} + +c10::SymNode NestedIntSymNodeImpl::lt(const c10::SymNode& other) { + return SymNode(c10::make_intrusive>( + !_ge("lt", this, other.get()))); +} + +c10::SymNode NestedIntSymNodeImpl::le(const c10::SymNode& other) { + return SymNode(c10::make_intrusive>( + _ge("le", other.get(), this))); +} + +c10::SymNode NestedIntSymNodeImpl::mul(const c10::SymNode& other) { + TORCH_CHECK(!other->nested_int(), "nested int cannot be multiplied by nested int"); + c10::optional c = other->constant_int(); + TORCH_CHECK(c.has_value()); + return SymNode(c10::make_intrusive(val_, coeff_ * *c)); +} + +c10::SymNode NestedIntSymNodeImpl::clone() { + return SymNode(c10::make_intrusive(val_, coeff_)); +} + +} // namespace c10 diff --git a/aten/src/ATen/core/NestedIntSymNodeImpl.h b/aten/src/ATen/core/NestedIntSymNodeImpl.h new file mode 100644 index 0000000000000..228f4310a38fc --- /dev/null +++ b/aten/src/ATen/core/NestedIntSymNodeImpl.h @@ -0,0 +1,187 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include + +namespace c10 { + +// The motivating usecase for this is to represent the 
ragged size structure +// of a jagged tensor [B, [s_0, s_1, s_2], D] as a single integer j0. This +// allows us to simply return [B, j0, D] if someone queries for the size of our +// tensor. +// +// Morally we define comparison between two nested ints to return true if +// that comparison holds for all corresponding elements of the arrays they +// represent. Comparison between a nested int and a plain int is defined +// similarly. +// +// To simulate this desired behavior but also avoid the O(N) cost of checking, +// we associate each raggedness pattern with an integer "id" that can be used as +// a proxy to evaluate equality. We also constrain the range of values for this +// as to enable inequality checks. +// +// We also support a positive integer scalar "coeff" that is used for computing +// strides. For example given, a [B, j0, D] tensor, it can be strided in two +// different ways: [D * j0, D, 1] and [j0, 1, sum(j0)]. The coeff is used to +// differentiate the two cases. +// +// During tracing the strides of the outputs need to be a function of the size +// and strides of the inputs so it is important that NestedIntSymNode itself is +// able to express this. +class TORCH_API NestedIntSymNodeImpl : public SymNodeImpl { + public: + // CAUTION: you should probably not be constructing these directly; please + // the higher-level API in python instead (TODO: actually introduce that). + explicit NestedIntSymNodeImpl(int64_t val, int64_t coeff) + : val_(val), coeff_(coeff) {} + + bool bool_() override { + return false; + } + + bool is_int() override { + return true; + } + + bool is_float() override { + return false; + } + + bool is_bool() override { + return false; + } + + bool is_nested_int() const override { + return true; + } + + bool has_hint() override { + return true; + } + + c10::SymNode wrap_int(int64_t num) override { + return SymNode(c10::make_intrusive>(num)); + }; + + int64_t guard_int(const char* file, int64_t line) override { + TORCH_CHECK(false); + } + + double guard_float(const char* file, int64_t line) override { + TORCH_CHECK(false, "not a float"); + } + + bool guard_bool(const char* file, int64_t line) override { + TORCH_CHECK(false, "not a bool"); + } + + int64_t int_() override { + TORCH_CHECK(false); + } + + std::string str() override { + if (coeff_ == 1) { + return "j" + std::to_string(val_); + } + return std::to_string(coeff_) + "*j" + std::to_string(val_); + } + + // NOTE [ Inequalities with nested int ] + // + // The semantics of nested int when it comes to relations is that it is + // treated as integer known to be within a certain range, + // + // j0 \in [2, int64_t::max] + // + // allowing us to answer queries like j0 >= 1 (True), and j0 == 0 (False). + // This is a useful default range for the raggedness pattern of a jagged + // tensor (1) since sizes are non-negative, and (2) we need to get past 0/1 + // specialization checks. + // + // [ Indeterminate inequalities error out ] + // + // Given the semantic defined above, certain relations like j0 < 3 are thus + // indeterminable. In our impl today, evaluating such relations error + // + // It may seem convenient to just define indeterminate relations to return + // False, but the implementation we maintain in parallel using sympy does not + // allow this. + // + // Sympy only allows overriding of Ge. The other relations (Lt, Gt, Le) are, + // by consequence, all derived from Ge e.g., Lt(a, b) := !Ge(a, b). 
This + // would mean that means that if we define the indeterminate j0 >= 3 to be + // False, the also indeterminate j0 < 3 will be evaluated to be True! + // + // [ Coefficient are assumed positive ] + // + // For the purpose of computing inequalities, we consider the coefficient of + // the nested int to be a positive integer. + // + // Thus, no modifications are needed to the logic since + // j0 >= k implies coeff * j0 >= k + // + c10::SymNode eq(const c10::SymNode& other) override; + c10::SymNode ne(const c10::SymNode& other) override; + c10::SymNode ge(const c10::SymNode& other) override; + c10::SymNode gt(const c10::SymNode& other) override; + c10::SymNode lt(const c10::SymNode& other) override; + c10::SymNode le(const c10::SymNode& other) override; + c10::SymNode mul(const c10::SymNode& other) override; + + c10::optional nested_int() override { + return val_; + } + + c10::optional nested_int_coeff() override { + return coeff_; + } + + bool is_symbolic() override { + return false; + } + + c10::SymNode clone() override; + +#define DEFINE_BINARY_NOT_SUPPORTED(name) \ + c10::SymNode name(const c10::SymNode& other) override { \ + TORCH_CHECK(false, #name " not supported by NestedIntSymNode"); \ + } + + DEFINE_BINARY_NOT_SUPPORTED(add) + DEFINE_BINARY_NOT_SUPPORTED(sub) + DEFINE_BINARY_NOT_SUPPORTED(truediv) + DEFINE_BINARY_NOT_SUPPORTED(pow) + DEFINE_BINARY_NOT_SUPPORTED(floordiv) + DEFINE_BINARY_NOT_SUPPORTED(mod) + DEFINE_BINARY_NOT_SUPPORTED(sym_min) + DEFINE_BINARY_NOT_SUPPORTED(sym_max) + DEFINE_BINARY_NOT_SUPPORTED(sym_and) + DEFINE_BINARY_NOT_SUPPORTED(sym_or) + +#undef DEFINE_BINARY_NOT_SUPPORTED + +#define DEFINE_NOT_SUPPORTED(name) \ + c10::SymNode name() override { \ + TORCH_CHECK(false, #name " is not supported by NestedIntSymNode"); \ + } + + DEFINE_NOT_SUPPORTED(sym_not) + DEFINE_NOT_SUPPORTED(ceil) + DEFINE_NOT_SUPPORTED(floor) + DEFINE_NOT_SUPPORTED(neg) + DEFINE_NOT_SUPPORTED(sym_float) + +#undef DEFINE_NOT_SUPPORTED + + private: + int64_t val_; + int64_t coeff_; +}; + +} // namespace c10 diff --git a/aten/src/ATen/core/PythonFallbackKernel.cpp b/aten/src/ATen/core/PythonFallbackKernel.cpp index 4758942c529b9..a34341b4a9437 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.cpp +++ b/aten/src/ATen/core/PythonFallbackKernel.cpp @@ -120,8 +120,8 @@ void preDispatchFallback(const c10::OperatorHandle& op, c10::DispatchKeySet disp } // anonymous namespace -namespace at { -namespace impl { + +namespace at::impl { RestorePythonTLSSnapshot::RestorePythonTLSSnapshot() : saved_(safe_get_tls_on_entry()), guard_(safe_get_tls_on_entry()) { tls_on_entry = c10::nullopt; @@ -148,8 +148,7 @@ MaybeSetTLSOnEntryGuard::~MaybeSetTLSOnEntryGuard() { } -} // namespace impl -} // namespace at +} // namespace at::impl TORCH_LIBRARY_IMPL(_, Python, m) { m.fallback(torch::CppFunction::makeFromBoxedFunction<&pythonFallback>()); diff --git a/aten/src/ATen/core/PythonFallbackKernel.h b/aten/src/ATen/core/PythonFallbackKernel.h index f38bdd2ada90a..67f24795eeb58 100644 --- a/aten/src/ATen/core/PythonFallbackKernel.h +++ b/aten/src/ATen/core/PythonFallbackKernel.h @@ -1,8 +1,8 @@ #pragma once #include -namespace at { -namespace impl { + +namespace at::impl { struct TORCH_API RestorePythonTLSSnapshot { RestorePythonTLSSnapshot(); @@ -24,5 +24,4 @@ struct TORCH_API MaybeSetTLSOnEntryGuard { bool value_set_; }; -} // namespace impl -} // namespace at +} // namespace at::impl diff --git a/aten/src/ATen/core/PythonOpRegistrationTrampoline.cpp b/aten/src/ATen/core/PythonOpRegistrationTrampoline.cpp index 
69f3fcc644f0f..219d774de3a54 100644 --- a/aten/src/ATen/core/PythonOpRegistrationTrampoline.cpp +++ b/aten/src/ATen/core/PythonOpRegistrationTrampoline.cpp @@ -1,7 +1,6 @@ #include -namespace at { -namespace impl { +namespace at::impl { // The strategy is that all python interpreters attempt to register themselves // as the main interpreter, but only one wins. Only that interpreter is @@ -9,14 +8,15 @@ namespace impl { // logic on that interpreter, we do so hermetically, never setting pyobj field // on Tensor. -std::atomic PythonOpRegistrationTrampoline::interpreter_{nullptr}; +std::atomic + PythonOpRegistrationTrampoline::interpreter_{nullptr}; c10::impl::PyInterpreter* PythonOpRegistrationTrampoline::getInterpreter() { return PythonOpRegistrationTrampoline::interpreter_.load(); - } -bool PythonOpRegistrationTrampoline::registerInterpreter(c10::impl::PyInterpreter* interp) { +bool PythonOpRegistrationTrampoline::registerInterpreter( + c10::impl::PyInterpreter* interp) { c10::impl::PyInterpreter* expected = nullptr; interpreter_.compare_exchange_strong(expected, interp); if (expected != nullptr) { @@ -29,5 +29,4 @@ bool PythonOpRegistrationTrampoline::registerInterpreter(c10::impl::PyInterprete } } -} // namespace impl -} // namespace at +} // namespace at::impl diff --git a/aten/src/ATen/core/PythonOpRegistrationTrampoline.h b/aten/src/ATen/core/PythonOpRegistrationTrampoline.h index b1a2b30685f30..bec323c7d25bf 100644 --- a/aten/src/ATen/core/PythonOpRegistrationTrampoline.h +++ b/aten/src/ATen/core/PythonOpRegistrationTrampoline.h @@ -4,8 +4,8 @@ // TODO: this can probably live in c10 -namespace at { -namespace impl { + +namespace at::impl { class TORCH_API PythonOpRegistrationTrampoline final { static std::atomic interpreter_; @@ -19,5 +19,4 @@ class TORCH_API PythonOpRegistrationTrampoline final { static c10::impl::PyInterpreter* getInterpreter(); }; -} // namespace impl -} // namespace at +} // namespace at::impl diff --git a/aten/src/ATen/core/QuantizerBase.h b/aten/src/ATen/core/QuantizerBase.h index b6031f0d77983..0d2eaeece8898 100644 --- a/aten/src/ATen/core/QuantizerBase.h +++ b/aten/src/ATen/core/QuantizerBase.h @@ -37,6 +37,7 @@ using QuantizerPtr = c10::intrusive_ptr; * share the same Quantizer. Quantizer should be immutable. */ struct TORCH_API Quantizer : public c10::intrusive_ptr_target { + // NOLINTNEXTLINE(cppcoreguidelines-avoid-const-or-ref-data-members) const ScalarType scalar_type_; explicit Quantizer(ScalarType scalar_type) : scalar_type_(scalar_type) {} ~Quantizer() override; diff --git a/aten/src/ATen/core/Reduction.h b/aten/src/ATen/core/Reduction.h index 23c6ea3cabefb..340e9f91ae8f7 100644 --- a/aten/src/ATen/core/Reduction.h +++ b/aten/src/ATen/core/Reduction.h @@ -1,16 +1,14 @@ #pragma once -namespace at { -namespace Reduction { +namespace at::Reduction { // NB: Keep this in sync with Reduction class in torch/nn/_reduction.py // These constants control the reduction behavior of loss functions. 
// Ideally, this would be a scoped enum, but jit doesn't support that enum Reduction { - None, // Do not reduce - Mean, // (Possibly weighted) mean of losses - Sum, // Sum losses + None, // Do not reduce + Mean, // (Possibly weighted) mean of losses + Sum, // Sum losses END }; -} // namespace Reduction -} // namespace at +} // namespace at::Reduction diff --git a/aten/src/ATen/core/SingletonSymNodeImpl.cpp b/aten/src/ATen/core/SingletonSymNodeImpl.cpp deleted file mode 100644 index 3ac668d987825..0000000000000 --- a/aten/src/ATen/core/SingletonSymNodeImpl.cpp +++ /dev/null @@ -1,78 +0,0 @@ -#include -#include -#include - -namespace c10 { - -namespace { -bool _eq(const char* op, c10::SymNodeImpl* lhs, c10::SymNodeImpl* rhs) { - TORCH_INTERNAL_ASSERT(lhs->singleton_int().has_value()); - c10::optional c = rhs->singleton_int(); - return ( - c.has_value() && lhs->singleton_int() == *c && - lhs->singleton_coeff() == rhs->singleton_coeff()); -} -bool _ge(const char* op, c10::SymNodeImpl* lhs, c10::SymNodeImpl* rhs) { - if (auto mb_si = lhs->singleton_int()) { - if (auto mb_si2 = rhs->singleton_int()) { - if (*mb_si == *mb_si2) { - return lhs->singleton_coeff() >= rhs->singleton_coeff(); - } - TORCH_CHECK(false, "Singleton int ", op, ": Relation is indeterminate"); - } - // NOLINTNEXTLINE(bugprone-unchecked-optional-access) - if (rhs->constant_int() && *rhs->constant_int() <= 2) { - return true; - } - TORCH_CHECK(false, "Singleton int ", op, ": Relation is indeterminate"); - } else if (rhs->singleton_int()) { - // NOLINTNEXTLINE(bugprone-unchecked-optional-access) - if (lhs->constant_int() && *lhs->constant_int() < 2) { - return false; - } - TORCH_CHECK(false, "Singleton int ", op, ": Relation is indeterminate"); - } - TORCH_INTERNAL_ASSERT(false, "expect at least one singleton"); -} -} // namespace - -c10::SymNode SingletonSymNodeImpl::eq(const c10::SymNode& other) { - return SymNode(c10::make_intrusive>( - _eq("eq", this, other.get()))); -} - -c10::SymNode SingletonSymNodeImpl::ne(const c10::SymNode& other) { - return SymNode(c10::make_intrusive>( - !_eq("ne", this, other.get()))); -} - -c10::SymNode SingletonSymNodeImpl::ge(const c10::SymNode& other) { - return SymNode(c10::make_intrusive>( - _ge("ge", this, other.get()))); -} - -c10::SymNode SingletonSymNodeImpl::gt(const c10::SymNode& other) { - return SymNode(c10::make_intrusive>( - !_ge("gt", other.get(), this))); -} - -c10::SymNode SingletonSymNodeImpl::lt(const c10::SymNode& other) { - return SymNode(c10::make_intrusive>( - !_ge("lt", this, other.get()))); -} - -c10::SymNode SingletonSymNodeImpl::le(const c10::SymNode& other) { - return SymNode(c10::make_intrusive>( - _ge("le", other.get(), this))); -} - -c10::SymNode SingletonSymNodeImpl::mul(const c10::SymNode& other) { - if (auto mb_si = other->singleton_int()) { - TORCH_CHECK(false, "Singleton int cannot be multiplied by singleton int"); - } - c10::optional c = other->constant_int(); - TORCH_CHECK(c.has_value()); - return SymNode(c10::make_intrusive(val_, coeff_ * *c)); -} - -} // namespace c10 diff --git a/aten/src/ATen/core/SingletonSymNodeImpl.h b/aten/src/ATen/core/SingletonSymNodeImpl.h deleted file mode 100644 index 5c4c9720f8487..0000000000000 --- a/aten/src/ATen/core/SingletonSymNodeImpl.h +++ /dev/null @@ -1,182 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include - -namespace c10 { - -// The motivating usecase for this is to represent the ragged size structure -// of a jagged tensor [B, [s_0, s_1, s_2], D] as a single 
integer j0. This -// allows us to simply return [B, j0, D] if someone queries for the size of our -// tensor. -// -// Morally we define comparison between two singleton ints to return true if -// that comparison holds for all corresponding elements of the arrays they -// represent. Comparison between a singleton int and a plain int is defined -// similarly. -// -// To simulate this desired behavior but also avoid the O(N) cost of checking, -// we associate each raggedness pattern with an integer "id" that can be used as -// a proxy to evaluate equality. We also constrain the range of values for this -// as to enable inequality checks. -// -// We also support a positive integer scalar "coeff" that is used for computing -// strides. For example given, a [B, j0, D] tensor, it can be strided in two -// different ways: [D * j0, D, 1] and [j0, 1, sum(j0)]. The coeff is used to -// differentiate the two cases. -// -// During tracing the strides of the outputs need to be a function of the size -// and strides of the inputs so it is important that SingletonSymNode itself is -// able to express this. -class TORCH_API SingletonSymNodeImpl : public SymNodeImpl { - public: - // CAUTION: you should probably not be constructing these directly; please - // the higher-level API in python instead (TODO: actually introduce that). - explicit SingletonSymNodeImpl(int64_t val, int64_t coeff) - : val_(val), coeff_(coeff) {} - - bool bool_() override { - return false; - } - - bool is_int() override { - return true; - } - - bool is_float() override { - return false; - } - - bool is_bool() override { - return false; - } - - bool has_hint() override { - return true; - } - - c10::SymNode wrap_int(int64_t num) override { - return SymNode(c10::make_intrusive>(num)); - }; - - int64_t guard_int(const char* file, int64_t line) override { - TORCH_CHECK(false); - } - - double guard_float(const char* file, int64_t line) override { - TORCH_CHECK(false, "not a float"); - } - - bool guard_bool(const char* file, int64_t line) override { - TORCH_CHECK(false, "not a bool"); - } - - int64_t int_() override { - TORCH_CHECK(false); - } - - std::string str() override { - if (coeff_ == 1) { - return "j" + std::to_string(val_); - } - return std::to_string(coeff_) + "*j" + std::to_string(val_); - } - - // NOTE [ Inequalities with SingletonInt ] - // - // The semantics of SingletonInt when it comes to relations is that it is - // treated as integer known to be within a certain range, - // - // j0 \in [2, int64_t::max] - // - // allowing us to answer queries like j0 >= 1 (True), and j0 == 0 (False). - // This is a useful default range for the raggedness pattern of a jagged - // tensor (1) since sizes are non-negative, and (2) we need to get past 0/1 - // specialization checks. - // - // [ Indeterminate inequalities error out ] - // - // Given the semantic defined above, certain relations like j0 < 3 are thus - // indeterminable. In our impl today, evaluating such relations error - // - // It may seem convenient to just define indeterminate relations to return - // False, but the implementation we maintain in parallel using sympy does not - // allow this. - // - // Sympy only allows overriding of Ge. The other relations (Lt, Gt, Le) are, - // by consequence, all derived from Ge e.g., Lt(a, b) := !Ge(a, b). This - // would mean that means that if we define the indeterminate j0 >= 3 to be - // False, the also indeterminate j0 < 3 will be evaluated to be True! 
- // - // [ Coefficient are assumed positive ] - // - // For the purpose of computing inequalities, we consider the coefficient of - // the SingletonInt to be a positive integer. - // - // Thus, no modifications are needed to the logic since - // j0 >= k implies coeff * j0 >= k - // - c10::SymNode eq(const c10::SymNode& other) override; - c10::SymNode ne(const c10::SymNode& other) override; - c10::SymNode ge(const c10::SymNode& other) override; - c10::SymNode gt(const c10::SymNode& other) override; - c10::SymNode lt(const c10::SymNode& other) override; - c10::SymNode le(const c10::SymNode& other) override; - c10::SymNode mul(const c10::SymNode& other) override; - - c10::optional singleton_int() override { - return val_; - } - - c10::optional singleton_coeff() override { - return coeff_; - } - - bool is_symbolic() override { - return false; - } - -#define DEFINE_BINARY_NOT_SUPPORTED(name) \ - c10::SymNode name(const c10::SymNode& other) override { \ - TORCH_CHECK(false, #name " not supported by SingletonSymNode"); \ - } - - DEFINE_BINARY_NOT_SUPPORTED(add) - DEFINE_BINARY_NOT_SUPPORTED(sub) - DEFINE_BINARY_NOT_SUPPORTED(truediv) - DEFINE_BINARY_NOT_SUPPORTED(pow) - DEFINE_BINARY_NOT_SUPPORTED(floordiv) - DEFINE_BINARY_NOT_SUPPORTED(mod) - DEFINE_BINARY_NOT_SUPPORTED(sym_min) - DEFINE_BINARY_NOT_SUPPORTED(sym_max) - DEFINE_BINARY_NOT_SUPPORTED(sym_and) - DEFINE_BINARY_NOT_SUPPORTED(sym_or) - -#undef DEFINE_BINARY_NOT_SUPPORTED - -#define DEFINE_NOT_SUPPORTED(name) \ - c10::SymNode name() override { \ - TORCH_CHECK(false, #name " is not supported by SingletonSymNode"); \ - } - - DEFINE_NOT_SUPPORTED(sym_not) - DEFINE_NOT_SUPPORTED(ceil) - DEFINE_NOT_SUPPORTED(floor) - DEFINE_NOT_SUPPORTED(neg) - DEFINE_NOT_SUPPORTED(clone) - DEFINE_NOT_SUPPORTED(sym_float) - -#undef DEFINE_NOT_SUPPORTED - - private: - int64_t val_; - int64_t coeff_; -}; - -} // namespace c10 diff --git a/aten/src/ATen/core/Tensor.cpp b/aten/src/ATen/core/Tensor.cpp index 92befbf481f24..ed19144d0eaff 100644 --- a/aten/src/ATen/core/Tensor.cpp +++ b/aten/src/ATen/core/Tensor.cpp @@ -72,9 +72,9 @@ void TensorBase::enforce_invariants() { void TensorBase::print() const { if (defined()) { - std::cerr << "[" << toString() << " " << sizes() << "]" << std::endl; + std::cerr << "[" << toString() << " " << sizes() << "]" << '\n'; } else { - std::cerr << "[UndefinedTensor]" << std::endl; + std::cerr << "[UndefinedTensor]" << '\n'; } } diff --git a/aten/src/ATen/core/Tensor.h b/aten/src/ATen/core/Tensor.h index d20ab49d42783..8172cf31e7522 100644 --- a/aten/src/ATen/core/Tensor.h +++ b/aten/src/ATen/core/Tensor.h @@ -68,6 +68,7 @@ class TORCH_API TensorRef { }; template +// NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) auto Tensor::register_hook(T&& hook) const -> Tensor::hook_return_void_t { // Return the grad argument in case of a hook with void return type to have an // std::function with Tensor return type @@ -81,6 +82,7 @@ auto Tensor::register_hook(T&& hook) const -> Tensor::hook_return_void_t { } template +// NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) auto Tensor::register_hook(T&& hook) const -> Tensor::hook_return_var_t { return _register_hook([fn=std::forward(hook)](const TensorBase& grad_base) { TensorRef grad(grad_base); diff --git a/aten/src/ATen/core/TensorAccessor.h b/aten/src/ATen/core/TensorAccessor.h index 7440376d0e976..a1a4e0972d3ac 100644 --- a/aten/src/ATen/core/TensorAccessor.h +++ b/aten/src/ATen/core/TensorAccessor.h @@ -7,6 +7,7 @@ #include #include #include +#include namespace at { @@ -131,7 
+132,7 @@ class GenericPackedTensorAccessorBase { } // if index_t is not int64_t, we want to have an int64_t constructor - template ::value>::type> + template >> // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) C10_HOST GenericPackedTensorAccessorBase( PtrType data_, @@ -184,7 +185,7 @@ class GenericPackedTensorAccessor : public GenericPackedTensorAccessorBase(data_, sizes_, strides_) {} // if index_t is not int64_t, we want to have an int64_t constructor - template ::value>::type> + template >> C10_HOST GenericPackedTensorAccessor( PtrType data_, const source_index_t* sizes_, @@ -231,7 +232,7 @@ class GenericPackedTensorAccessor : public GenericPackedT : GenericPackedTensorAccessorBase(data_, sizes_, strides_) {} // if index_t is not int64_t, we want to have an int64_t constructor - template ::value>::type> + template >> C10_HOST GenericPackedTensorAccessor( PtrType data_, const source_index_t* sizes_, diff --git a/aten/src/ATen/core/TensorBase.h b/aten/src/ATen/core/TensorBase.h index b82e6b25e1d80..e03c6bdf2bd10 100644 --- a/aten/src/ATen/core/TensorBase.h +++ b/aten/src/ATen/core/TensorBase.h @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -27,11 +28,11 @@ namespace c10 { class Scalar; } -namespace torch { namespace autograd { +namespace torch::autograd { struct Node; -}} // namespace torch::autograd +} // namespace torch::autograd namespace at { @@ -415,7 +416,7 @@ class TORCH_API TensorBase { } /// Returns a `Tensor`'s device index. - int64_t get_device() const { + DeviceIndex get_device() const { // NB: this is not a native function to avoid dispatching overhead. return impl_->get_device(); } @@ -506,10 +507,10 @@ class TORCH_API TensorBase { return impl_->is_mps(); } - /// Returns if a `Tensor` is ort tensor. - bool is_ort() const { + /// Returns if a `Tensor` is maia tensor. + bool is_maia() const { // NB: this is not a native function to avoid dispatching overhead. - return impl_->is_ort(); + return impl_->is_maia(); } /// Returns if a `Tensor` is vulkan tensor. 
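Illustrative aside, not part of the patch: several hunks above (for example the GenericPackedTensorAccessor constructors) replace the verbose typename std::enable_if<std::is_same<...>::value>::type spelling with the shorter std::enable_if_t / std::is_same_v aliases without changing behaviour. A minimal self-contained sketch of the constrained-constructor idiom being modernized, with a made-up Accessor type:

#include <cstdint>
#include <type_traits>

template <typename index_t>
struct Accessor {
  // Primary constructor in the accessor's native index type.
  Accessor(const index_t* sizes, const index_t* strides) { use(sizes, strides); }

  // Extra constructor that only participates in overload resolution when the
  // source indices are int64_t, so an Accessor<int32_t> can still be built
  // from int64_t sizes/strides (narrowed internally).
  template <
      typename source_index_t,
      typename = std::enable_if_t<std::is_same_v<source_index_t, int64_t>>>
  Accessor(const source_index_t* sizes, const source_index_t* strides) {
    use(sizes, strides);
  }

 private:
  template <typename T>
  void use(const T*, const T*) {}  // placeholder for copying sizes/strides
};

For example, constructing Accessor<int32_t> from const int64_t* arrays compiles via the second constructor, while passing const int32_t* arrays uses the first.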
@@ -593,9 +594,12 @@ class TORCH_API TensorBase { return mutable_data_ptr(); } - template + template , int> = 0> const T* const_data_ptr() const; + template , int> = 0> + const std::remove_const_t* const_data_ptr() const; + template T* mutable_data_ptr() const; @@ -620,7 +624,13 @@ class TORCH_API TensorBase { TensorAccessor accessor() const& { static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data_ptr()"); TORCH_CHECK(dim() == N, "TensorAccessor expected ", N, " dims but tensor has ", dim()); - return TensorAccessor(data_ptr(),sizes().data(),strides().data()); + T* ptr = nullptr; + if constexpr (std::is_const::value) { + ptr = const_data_ptr(); + } else { + ptr = mutable_data_ptr(); + } + return TensorAccessor(ptr,sizes().data(),strides().data()); } template TensorAccessor accessor() && = delete; @@ -634,7 +644,13 @@ class TORCH_API TensorBase { GenericPackedTensorAccessor generic_packed_accessor() const& { static_assert(N > 0, "accessor is used for indexing tensor, for scalars use *data_ptr()"); TORCH_CHECK(dim() == N, "TensorAccessor expected ", N, " dims but tensor has ", dim()); - return GenericPackedTensorAccessor(static_cast::PtrType>(data_ptr()),sizes().data(),strides().data()); + T* ptr = nullptr; + if constexpr (std::is_const::value) { + ptr = const_data_ptr(); + } else { + ptr = mutable_data_ptr(); + } + return GenericPackedTensorAccessor(static_cast::PtrType>(ptr),sizes().data(),strides().data()); } template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> GenericPackedTensorAccessor generic_packed_accessor() && = delete; @@ -815,9 +831,9 @@ class TORCH_API TensorBase { //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template - using hook_return_void_t = std::enable_if_t>::value, unsigned>; + using hook_return_void_t = std::enable_if_t>, unsigned>; template - using hook_return_var_t = std::enable_if_t, TensorBase>::value, unsigned>; + using hook_return_var_t = std::enable_if_t, TensorBase>, unsigned>; /// Registers a backward hook. 
/// @@ -904,15 +920,16 @@ class TORCH_API TensorBase { TensorBase __dispatch_contiguous(c10::MemoryFormat) const; }; -inline int64_t get_device(const TensorBase& self) { +inline DeviceIndex get_device(const TensorBase& self) { return self.get_device(); } template +// NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) auto TensorBase::register_hook(T&& hook) const -> TensorBase::hook_return_void_t { // Return the grad argument in case of a hook with void return type to have an // std::function with Tensor return type - static_assert(std::is_same::value, + static_assert(std::is_same_v, "Expected hook to return void"); return _register_hook([fn=std::forward(hook)](const TensorBase& grad) { fn(grad); @@ -1010,9 +1027,9 @@ inline c10::MaybeOwned TensorBase::expect_contiguous(MemoryFormat me namespace symint { template -using enable_if_symint = std::enable_if_t::value>; +using enable_if_symint = std::enable_if_t>; template -using enable_if_int = std::enable_if_t::value>; +using enable_if_int = std::enable_if_t>; template > c10::SymIntArrayRef sizes(const TensorBase& t) { return t.sym_sizes(); } diff --git a/aten/src/ATen/core/TorchDispatchUtils.cpp b/aten/src/ATen/core/TorchDispatchUtils.cpp index e2f981c6a8332..8f666e5a476ab 100644 --- a/aten/src/ATen/core/TorchDispatchUtils.cpp +++ b/aten/src/ATen/core/TorchDispatchUtils.cpp @@ -1,7 +1,7 @@ #include -namespace at { -namespace impl { + +namespace at::impl { bool tensor_has_dispatch(const at::Tensor& t) { DispatchKeySet key_set({DispatchKey::Python, DispatchKey::PythonTLSSnapshot}); @@ -27,5 +27,4 @@ bool tensorlist_has_dispatch(const c10::List>& li) { return false; } -} // namespace impl -} // namespace at +} // namespace at::impl diff --git a/aten/src/ATen/core/TorchDispatchUtils.h b/aten/src/ATen/core/TorchDispatchUtils.h index a55074812b612..0ead779360097 100644 --- a/aten/src/ATen/core/TorchDispatchUtils.h +++ b/aten/src/ATen/core/TorchDispatchUtils.h @@ -6,12 +6,11 @@ #include #include -namespace at { -namespace impl { +namespace at::impl { TORCH_API bool tensor_has_dispatch(const at::Tensor& t); TORCH_API bool tensorlist_has_dispatch(at::ITensorListRef li); TORCH_API bool tensorlist_has_dispatch(const c10::List>& li); using c10::impl::dispatch_mode_enabled; -}} +} diff --git a/aten/src/ATen/core/TransformationHelper.h b/aten/src/ATen/core/TransformationHelper.h index 1061a732ddb7e..f81018a8e674f 100644 --- a/aten/src/ATen/core/TransformationHelper.h +++ b/aten/src/ATen/core/TransformationHelper.h @@ -1,11 +1,13 @@ +#include #include #include #include #include -#include -#include +#include #include #include +#include +#include namespace at { @@ -54,12 +56,12 @@ C10_HOST_DEVICE inline T uniform_int_full_range(V val) { * in this overloaded version */ template -C10_HOST_DEVICE inline typename std::enable_if::value), T>::type uniform_int(V val) { +C10_HOST_DEVICE inline std::enable_if_t), T>uniform_int(V val) { if constexpr (std::is_same_v) { return static_cast(val & 1); } else if constexpr (std::is_same_v) { return static_cast(val % (static_cast(std::numeric_limits::max()) + 1)); - } else if constexpr (std::is_same_v || std::is_same::value) { + } else if constexpr (std::is_same_v || std::is_same_v) { return static_cast(val % static_cast((1ULL << std::numeric_limits::digits) + 1)); } else if constexpr (std::is_integral_v) { return static_cast(val % (static_cast(std::numeric_limits::max()) + 1)); @@ -74,7 +76,7 @@ C10_HOST_DEVICE inline typename std::enable_if::valu * added to fix compiler warnings reported in GitHub issue 46391. 
T is either float or double in this version. */ template -C10_HOST_DEVICE inline typename std::enable_if::value, T>::type uniform_int(V val) { +C10_HOST_DEVICE inline std::enable_if_t, T>uniform_int(V val) { return static_cast(val % static_cast((1ULL << std::numeric_limits::digits) + 1)); } diff --git a/aten/src/ATen/core/VariableHooksInterface.cpp b/aten/src/ATen/core/VariableHooksInterface.cpp index 7525584e0d7d0..a062582c2101f 100644 --- a/aten/src/ATen/core/VariableHooksInterface.cpp +++ b/aten/src/ATen/core/VariableHooksInterface.cpp @@ -1,6 +1,6 @@ #include -namespace at { namespace impl { +namespace at::impl { namespace { VariableHooksInterface* hooks = nullptr; @@ -17,4 +17,4 @@ bool HasVariableHooks() { return hooks != nullptr; } -}} // namespace at::impl +} // namespace at::impl diff --git a/aten/src/ATen/core/VariableHooksInterface.h b/aten/src/ATen/core/VariableHooksInterface.h index f75342dea76a9..47d74f5433ac2 100644 --- a/aten/src/ATen/core/VariableHooksInterface.h +++ b/aten/src/ATen/core/VariableHooksInterface.h @@ -1,7 +1,7 @@ #pragma once -#include #include +#include // A little explanation about why this file exists at all. We have // a few methods on Tensor class which require access to reified access to @@ -29,20 +29,20 @@ // have weird signatures that are not supported by autograd, and (2) // see this bug https://github.com/pytorch/pytorch/issues/30102 -namespace torch { namespace autograd { +namespace torch::autograd { struct Node; -}} // namespace torch::autograd +} // namespace torch::autograd -namespace at { -namespace impl { +namespace at::impl { struct TORCH_API VariableHooksInterface { virtual ~VariableHooksInterface() = default; virtual TensorBase tensor_data(const TensorBase&) const = 0; virtual TensorBase variable_data(const TensorBase&) const = 0; - virtual const std::shared_ptr& grad_fn(const TensorBase&) const = 0; + virtual const std::shared_ptr& grad_fn( + const TensorBase&) const = 0; virtual unsigned _register_hook( const TensorBase&, std::function hook) const = 0; @@ -57,9 +57,17 @@ struct TORCH_API VariableHooksInterface { virtual int64_t _version(const TensorBase&) const = 0; virtual void retain_grad(const TensorBase&) const = 0; virtual bool retains_grad(const TensorBase&) const = 0; - virtual void _backward(const Tensor&, TensorList, const c10::optional&, c10::optional, bool) const = 0; + virtual void _backward( + const Tensor&, + TensorList, + const c10::optional&, + c10::optional, + bool) const = 0; virtual void requires_grad_(const TensorBase&, bool) const = 0; - virtual void basic_autograd_not_implemented_fallback(const c10::OperatorHandle& op, c10::DispatchKeySet dispatch_keys, torch::jit::Stack* stack) const = 0; + virtual void basic_autograd_not_implemented_fallback( + const c10::OperatorHandle& op, + c10::DispatchKeySet dispatch_keys, + torch::jit::Stack* stack) const = 0; }; TORCH_API void SetVariableHooks(VariableHooksInterface* hooks); @@ -72,4 +80,4 @@ struct TORCH_API VariableHooksRegisterer { } }; -}} // namespace at::impl +} // namespace at::impl diff --git a/aten/src/ATen/core/Variadic.h b/aten/src/ATen/core/Variadic.h index 61b6a35a0b1cb..da4df1b1b1a66 100644 --- a/aten/src/ATen/core/Variadic.h +++ b/aten/src/ATen/core/Variadic.h @@ -1,8 +1,5 @@ #pragma once -#include -#include -#include #include #include diff --git a/aten/src/ATen/core/Vitals.cpp b/aten/src/ATen/core/Vitals.cpp index 6746540f43e12..a854be6756bfa 100644 --- a/aten/src/ATen/core/Vitals.cpp +++ b/aten/src/ATen/core/Vitals.cpp @@ -2,8 +2,7 @@ #include #include 
-namespace at { -namespace vitals { +namespace at::vitals { APIVitals VitalsAPI; @@ -78,8 +77,7 @@ bool APIVitals::setVital( auto iter = name_map_.find(vital_name); TorchVital* vital = nullptr; if (iter == name_map_.end()) { - auto r = - name_map_.emplace(vital_name, TorchVital(vital_name)); + auto r = name_map_.emplace(vital_name, TorchVital(vital_name)); vital = &r.first->second; } else { vital = &iter->second; @@ -95,5 +93,4 @@ APIVitals::APIVitals() : vitals_enabled(false), name_map_() { setVital("CUDA", "used", "False", /* force = */ true); } -} // namespace vitals -} // namespace at +} // namespace at::vitals diff --git a/aten/src/ATen/core/Vitals.h b/aten/src/ATen/core/Vitals.h index 2d4fe1cc0995c..8a7a51e81e1d2 100644 --- a/aten/src/ATen/core/Vitals.h +++ b/aten/src/ATen/core/Vitals.h @@ -1,15 +1,11 @@ #pragma once -#include -#include -#include #include #include #include #include -namespace at { -namespace vitals { +namespace at::vitals { TORCH_API bool torchVitalEnabled(); @@ -82,8 +78,7 @@ class TORCH_API APIVitals { extern TORCH_API APIVitals VitalsAPI; -} // namespace vitals -} // namespace at +} // namespace at::vitals #define TORCH_VITAL_DECLARE(name) \ TORCH_API at::vitals::TorchVital TorchVital_##name; diff --git a/aten/src/ATen/core/adaption.cpp b/aten/src/ATen/core/adaption.cpp index 0c2976ab09219..ef06b9606ba7e 100644 --- a/aten/src/ATen/core/adaption.cpp +++ b/aten/src/ATen/core/adaption.cpp @@ -1,15 +1,13 @@ #include -namespace c10 { -namespace impl { + +namespace c10::impl { void common_device_check_failure(Device common_device, const at::Tensor& tensor, at::CheckedFrom methodName, at::CheckedFrom argName) { TORCH_CHECK(false, "Expected all tensors to be on the same device, but " - // NOLINTNEXTLINE(bugprone-unchecked-optional-access) "found at least two devices, ", common_device, " and ", tensor.device(), "! " "(when checking argument for argument ", argName, " in method ", methodName, ")"); } -} // namespace impl -} // namespace c10 +} // namespace c10::impl diff --git a/aten/src/ATen/core/blob.h b/aten/src/ATen/core/blob.h index 3e8da3e4e7a67..35ee3b358c991 100644 --- a/aten/src/ATen/core/blob.h +++ b/aten/src/ATen/core/blob.h @@ -1,10 +1,6 @@ #pragma once -#include -#include #include -#include -#include #include #include @@ -26,7 +22,7 @@ class TORCH_API Blob final : public c10::intrusive_ptr_target { /** * Initializes an empty Blob. */ - Blob() noexcept : meta_(), pointer_(nullptr), has_ownership_(false) {} + Blob() noexcept : meta_() {} ~Blob() override { Reset(); } @@ -148,11 +144,11 @@ class TORCH_API Blob final : public c10::intrusive_ptr_target { * call is made or the blob is destructed. */ template - typename std::remove_const::type* ShareExternal( - typename std::remove_const::type* allocated) { + std::remove_const_t* ShareExternal( + std::remove_const_t* allocated) { return static_cast(ShareExternal( static_cast(allocated), - TypeMeta::Make::type>())); + TypeMeta::Make>())); } void* ShareExternal(void* allocated, const TypeMeta meta) { @@ -176,7 +172,7 @@ class TORCH_API Blob final : public c10::intrusive_ptr_target { /** * @brief Swaps the underlying storage of two blobs. 
*/ - void swap(Blob& rhs) { + void swap(Blob& rhs) noexcept { using std::swap; swap(meta_, rhs.meta_); swap(pointer_, rhs.pointer_); @@ -191,13 +187,13 @@ class TORCH_API Blob final : public c10::intrusive_ptr_target { } TypeMeta meta_; - void* pointer_; - bool has_ownership_; + void* pointer_{nullptr}; + bool has_ownership_{false}; C10_DISABLE_COPY_AND_ASSIGN(Blob); }; -inline void swap(Blob& lhs, Blob& rhs) { +inline void swap(Blob& lhs, Blob& rhs) noexcept { lhs.swap(rhs); } diff --git a/aten/src/ATen/core/boxing/KernelFunction.h b/aten/src/ATen/core/boxing/KernelFunction.h index d8d0a3d151407..c950f4c80ffc7 100644 --- a/aten/src/ATen/core/boxing/KernelFunction.h +++ b/aten/src/ATen/core/boxing/KernelFunction.h @@ -6,6 +6,7 @@ #include #include #include +#include namespace c10 { @@ -17,7 +18,7 @@ class KernelFunction; template using has_symint = - guts::disjunction< + std::disjunction< std::is_same, std::is_same, std::is_same, diff --git a/aten/src/ATen/core/boxing/KernelFunction_impl.h b/aten/src/ATen/core/boxing/KernelFunction_impl.h index 8ef5315fbc7cc..0d6149c8090a9 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_impl.h +++ b/aten/src/ATen/core/boxing/KernelFunction_impl.h @@ -3,6 +3,9 @@ #include #include +#include +#include + namespace c10 { inline KernelFunction::KernelFunction() @@ -83,8 +86,7 @@ C10_ALWAYS_INLINE Return KernelFunction::call(const OperatorHandle& opHandle, Di // forwarding, which would require Args to be deduced, but instead we // want callers to explicitly specify the Args. - // This should get inlined by compiler - if (guts::disjunction...>::value) { + if constexpr (std::disjunction_v...>) { if (sym_unboxed_kernel_func_ != nullptr) { auto *functor = boxed_kernel_func_.getFunctor(); return callUnboxedKernelFunction( diff --git a/aten/src/ATen/core/boxing/impl/boxing.h b/aten/src/ATen/core/boxing/impl/boxing.h index efc68570924ef..82fdd824ea65b 100644 --- a/aten/src/ATen/core/boxing/impl/boxing.h +++ b/aten/src/ATen/core/boxing/impl/boxing.h @@ -10,6 +10,7 @@ #include #include +#include namespace c10 { namespace impl { @@ -38,7 +39,15 @@ template struct has_ivalue_to : std::false_type {}; template -struct has_ivalue_to().to())>> +struct ivalue_to_helper +{ + using type = decltype(std::declval().template to()); +}; +template +using ivalue_to_helper_t = typename ivalue_to_helper::type; + +template +struct has_ivalue_to>> : std::true_type {}; @@ -49,7 +58,7 @@ struct has_ivalue_to().to())>> // A boxable arg type is one that IValue has a constructor for. 
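Editor's note: the `ivalue_to_helper`/`has_ivalue_to` rework in boxing.h above is the standard member-detection idiom: probe `declval<IValue>().to<T>()` inside a `void_t` partial specialization and fall back to `std::false_type` when the expression is ill-formed. A self-contained sketch of the same pattern, using a toy `Value` class instead of the real `c10::IValue`:

```cpp
#include <string>
#include <type_traits>
#include <utility>

// Toy stand-in for c10::IValue: only arithmetic types are extractable here.
struct Value {
  template <class T, std::enable_if_t<std::is_arithmetic_v<T>, int> = 0>
  T to() const {
    return T{};
  }
};

// Primary template: assume no conversion exists.
template <class T, class Enable = void>
struct has_value_to : std::false_type {};

// Chosen only when `declval<Value>().to<T>()` is a well-formed expression.
template <class T>
struct has_value_to<
    T,
    std::void_t<decltype(std::declval<Value>().template to<T>())>>
    : std::true_type {};

static_assert(has_value_to<int>::value, "arithmetic types are extractable");
static_assert(!has_value_to<std::string>::value, "std::string is not");
```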
template using can_box = - guts::disjunction< + std::disjunction< std::is_constructible>, // TensorOptions are not directly constructible into IValue, // but torch::jit::push knows how to handle them @@ -57,18 +66,18 @@ using can_box = >; template -using can_box_all = guts::conjunction...>; +using can_box_all = std::conjunction...>; // an unboxable result is one that can be extracted from an IValue template using can_unbox = - guts::conjunction< - guts::disjunction< + std::conjunction< + std::disjunction< has_ivalue_to, // void returns are ok std::is_same >, - guts::negation> + std::negation> >; // diff --git a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h index 5308499edd439..ccd94ff1de2be 100644 --- a/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h +++ b/aten/src/ATen/core/boxing/impl/make_boxed_from_unboxed_functor.h @@ -183,7 +183,7 @@ namespace impl { struct assert_is_valid_input_type::value>> { // There is no reason to support float when we have double. Keep the API lean. static_assert(guts::false_t::value, - "You tried to register a kernel with an unsupported input type: float. Please use double instead."); + "You tried to register a kernel with an unsupported input type: float. Please use double instead; you should use `double` in the C++ function signature and `float` in the schema string."); }; template struct assert_is_valid_input_type::value>> { @@ -198,7 +198,7 @@ namespace impl { template struct assert_is_valid_input_type::value && !guts::typelist::contains::value>> { static_assert(guts::false_t::value, - "You tried to register a kernel with an unsupported integral input type. Please use int64_t instead."); + "You tried to register a kernel with an unsupported integral input type. Please use int64_t instead; you should use `int64_t` in the C++ function signature and `int` in the schema string."); }; template struct assert_is_valid_input_type::value>> { @@ -283,7 +283,7 @@ namespace impl { struct assert_is_valid_output_type::value>> { // There is no reason to support float when we have double. Keep the API lean. static_assert(guts::false_t::value, - "You tried to register a kernel with an unsupported output type: float. Please use double instead."); + "You tried to register a kernel with an unsupported output type: float. Please use double instead; you should use `double` in the C++ function signature and `float` in the schema string."); }; template struct assert_is_valid_output_type::value>> { @@ -298,7 +298,7 @@ namespace impl { template struct assert_is_valid_output_type::value && !guts::typelist::contains::value>> { static_assert(guts::false_t::value, - "You tried to register a kernel with an unsupported integral output type. Please use int64_t instead."); + "You tried to register a kernel with an unsupported integral output type. 
Please use int64_t instead; you should use `int64_t` in the C++ function signature and `int` in the schema string."); }; // ivalue_to_arg diff --git a/aten/src/ATen/core/builtin_function.h b/aten/src/ATen/core/builtin_function.h index 887e57b157ed5..b25ca55c16851 100644 --- a/aten/src/ATen/core/builtin_function.h +++ b/aten/src/ATen/core/builtin_function.h @@ -7,8 +7,7 @@ #include #include -namespace torch { -namespace jit { +namespace torch::jit { struct BuiltinOpFunction : public Function { BuiltinOpFunction( @@ -62,12 +61,16 @@ struct BuiltinOpFunction : public Function { return *this; } - bool call(Stack& stack, c10::optional, c10::function_ref) override { + bool call( + Stack& stack, + c10::optional, + c10::function_ref) override { run(stack); return false; } - bool call(Stack& stack, c10::function_ref) override { + bool call(Stack& stack, c10::function_ref) + override { run(stack); return false; } @@ -84,5 +87,4 @@ struct BuiltinOpFunction : public Function { std::string doc_string_; }; -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/aten/src/ATen/core/class_type.h b/aten/src/ATen/core/class_type.h index 310ed332aec77..99fd27bba5426 100644 --- a/aten/src/ATen/core/class_type.h +++ b/aten/src/ATen/core/class_type.h @@ -6,12 +6,12 @@ #include #include -namespace torch { -namespace jit { + +namespace torch::jit { struct CompilationUnit; struct Function; -} // namespace jit -} // namespace torch +} // namespace torch::jit + namespace c10 { @@ -390,7 +390,7 @@ struct TORCH_API ClassType : public NamedType { std::string doc_string = "", std::vector unresolved_class_attributes = {}); - std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override { const auto& n = name().value(); return n.qualifiedName(); } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.cpp b/aten/src/ATen/core/dispatch/Dispatcher.cpp index 66b199de3cd18..6077ac8e34cc8 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.cpp +++ b/aten/src/ATen/core/dispatch/Dispatcher.cpp @@ -266,24 +266,25 @@ void Dispatcher::deregisterDef_( namespace { -using AbstractImplPyStubsType = std::unordered_map>; -AbstractImplPyStubsType& abstractImplPyStubsSingleton() { - static AbstractImplPyStubsType _data; +// Maps OperatorName to (python module name, description) tuple. +using PythonModuleMapType = std::unordered_map>; +PythonModuleMapType& pythonModulesSingleton() { + static PythonModuleMapType _data; return _data; } } -c10::optional> Dispatcher::getAbstractImplPyStub(OperatorName op_name) { +c10::optional> Dispatcher::getPyStub(OperatorName op_name) { std::lock_guard lock(guard_->mutex); - auto found = abstractImplPyStubsSingleton().find(op_name); - if (found == abstractImplPyStubsSingleton().end()) { + auto found = pythonModulesSingleton().find(op_name); + if (found == pythonModulesSingleton().end()) { return c10::nullopt; } return found->second; } -RegistrationHandleRAII Dispatcher::registerAbstractImplPyStub( +RegistrationHandleRAII Dispatcher::registerPythonModule( const OperatorName& op_name, const char* pymodule, const char* context @@ -292,28 +293,28 @@ RegistrationHandleRAII Dispatcher::registerAbstractImplPyStub( // If there are duplicates, we just let it through and warn about it. // Throwing an error during static initialization causes a crash that // doesn't give any sign of what happened. 
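Editor's note: the expanded `static_assert` messages in make_boxed_from_unboxed_functor.h above spell out the registration convention: kernels use `double` and `int64_t` in their C++ signatures even though the schema strings spell those arguments `float` and `int`. A hypothetical registration following that convention (the operator name and namespace are invented for illustration):

```cpp
#include <ATen/ATen.h>
#include <torch/library.h>

// The C++ signature uses double and int64_t ...
at::Tensor scale_shift(const at::Tensor& x, double scale, int64_t shift) {
  return x * scale + shift;
}

TORCH_LIBRARY(myexample, m) {
  // ... while the schema string spells the same arguments as float and int.
  m.def("scale_shift(Tensor x, float scale, int shift) -> Tensor", &scale_shift);
}
```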
- auto found = abstractImplPyStubsSingleton().find(op_name); - if (found != abstractImplPyStubsSingleton().end()) { + auto found = pythonModulesSingleton().find(op_name); + if (found != pythonModulesSingleton().end()) { TORCH_WARN( - "Tried to register an abstract impl pystub for ", op_name, " ", + "Tried to register a Python registration stub (pystub) for ", op_name, " ", "that specifies the Python module ", pymodule, " " "but there already was a pystub that specifies the Python module ", found->second.first, ". We will override the existing pystub."); } - abstractImplPyStubsSingleton()[op_name] = std::make_pair(pymodule, context); + pythonModulesSingleton()[op_name] = std::make_pair(pymodule, context); return RegistrationHandleRAII([guard = this->guard_, op_name] { std::lock_guard lock(guard->mutex); if (!guard->alive.load()) { return; } - abstractImplPyStubsSingleton().erase(op_name); + pythonModulesSingleton().erase(op_name); }); } -void Dispatcher::throwIfHasAbstractImplPyStub(OperatorName op_name) { +void Dispatcher::throwIfHasPythonModule(OperatorName op_name) { std::lock_guard lock(guard_->mutex); - auto elt = abstractImplPyStubsSingleton().find(op_name); - if (elt == abstractImplPyStubsSingleton().end()) { + auto elt = pythonModulesSingleton().find(op_name); + if (elt == pythonModulesSingleton().end()) { return; } const char* pymodule = elt->second.first; @@ -498,37 +499,51 @@ std::vector Dispatcher::getRegistrationsForDispatchKey(c10::option }); } -int64_t Dispatcher::sequenceNumberForRunningRecordFunction(DispatchKey dispatchKey) { +int64_t Dispatcher::sequenceNumberForRunningRecordFunction(DispatchKey dispatchKey, DispatchKeySet dispatchKeySet) { int64_t seq_num = -1; // Setting sequence number in the Autograd case to associate // the forward range with the corresponding Autograd's node - if (isIncludedInAlias(dispatchKey, DispatchKey::Autograd) && at::GradMode::is_enabled()) { + + // Note: this records a sequence number for both Autograd keys, and for + // non-Autograd keys where the dispatchKeySet still contains an autograd key. + // This means that we might collect the same sequence number for two different + // events if they all occurred above Autograd and still had the Autograd + // dispatch key in the dispatch key set. + // However, this usually doesn't happen: normally the first call will + // go through the call() or callBoxed() path in the dispatcher, while + // subsequent redispatches go through redispatch() or redispatchBoxed(). + // `call` has profiler instrumentation, whereas `redispatch` doesn't. + // So usually, we'll collect a sequence number on the first call() if the + // dispatch keys contain autograd, and not on subsequent redispatches.
+ bool dispatchHasAutograd = !(dispatchKeySet & autograd_dispatch_keyset).empty(); + + if (dispatchHasAutograd && at::GradMode::is_enabled()) { seq_num = at::sequence_number::peek(); } return seq_num; } -void Dispatcher::runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, c10::ArrayRef args) { - guard.before(schema_ref, args, sequenceNumberForRunningRecordFunction(dispatchKey)); +void Dispatcher::runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, DispatchKeySet dispatchKeySet, c10::ArrayRef args) { + guard.before(schema_ref, args, sequenceNumberForRunningRecordFunction(dispatchKey, dispatchKeySet)); } -void Dispatcher::runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey) { +void Dispatcher::runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, DispatchKeySet dispatchKeySet) { // Setting sequence number in the Autograd case to associate // the forward range with the corresponding Autograd's node - guard.before(schema_ref, sequenceNumberForRunningRecordFunction(dispatchKey)); + guard.before(schema_ref, sequenceNumberForRunningRecordFunction(dispatchKey, dispatchKeySet)); } #ifdef FBCODE_CAFFE2 bool Dispatcher::profilingOperatorEvents() { return TORCH_SDT_IS_ENABLED(operator_start) || TORCH_SDT_IS_ENABLED(operator_end); } -void Dispatcher::fireOpStartUSDT(at::RecordFunction::schema_ref_t schema_ref) { +C10_NOINLINE void Dispatcher::fireOpStartUSDT(at::RecordFunction::schema_ref_t schema_ref) { if (TORCH_SDT_IS_ENABLED(operator_start)) { TORCH_SDT_WITH_SEMAPHORE(operator_start, schema_ref.get().name().c_str()); } } -void Dispatcher::fireOpEndUSDT(at::RecordFunction::schema_ref_t schema_ref) { +C10_NOINLINE void Dispatcher::fireOpEndUSDT(at::RecordFunction::schema_ref_t schema_ref) { if (TORCH_SDT_IS_ENABLED(operator_end)) { TORCH_SDT_WITH_SEMAPHORE(operator_end, schema_ref.get().name().c_str()); } diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index d383ee95569a2..caf73d7cebb21 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -224,17 +224,17 @@ class TORCH_API Dispatcher final { RegistrationHandleRAII registerImpl(OperatorName op_name, c10::optional dispatch_key, KernelFunction kernel, c10::optional cpp_signature, std::unique_ptr inferred_function_schema, std::string debug); /** - * Given an operator, tells the Dispatcher that we have implemented an abstract impl + * Given an operator, tells the Dispatcher that we have implemented a fake impl * for this op in the given Python module. Call this a "pystub". */ - RegistrationHandleRAII registerAbstractImplPyStub(const OperatorName& op_name, const char* pymodule, const char* context); + RegistrationHandleRAII registerPythonModule(const OperatorName& op_name, const char* pymodule, const char* context); /** - * Given an operator, throws if we have an abstract impl pystub. + * Given an operator, throws if we have a pystub. */ - void throwIfHasAbstractImplPyStub(OperatorName op_name); + void throwIfHasPythonModule(OperatorName op_name); - c10::optional> getAbstractImplPyStub(OperatorName op_name); + c10::optional> getPyStub(OperatorName op_name); /** * Register a new operator by name. 
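Editor's note: the new `dispatchHasAutograd` test above inspects the whole dispatch key set rather than only the highest-priority key, recording a sequence number whenever the set still intersects the autograd keys. A rough standalone model of that decision, with a plain bitmask standing in for `c10::DispatchKeySet` (the key names below are illustrative, not the real enum):

```cpp
#include <cstdint>
#include <iostream>

// Toy dispatch keys: one bit per key in a 64-bit set.
enum class Key : uint64_t {
  CPU          = 1ull << 0,
  CUDA         = 1ull << 1,
  AutogradCPU  = 1ull << 2,
  AutogradCUDA = 1ull << 3,
};

constexpr uint64_t bits(Key k) { return static_cast<uint64_t>(k); }

// Simplified analogue of autograd_dispatch_keyset.
constexpr uint64_t kAutogradKeys = bits(Key::AutogradCPU) | bits(Key::AutogradCUDA);

// Mirror of `!(dispatchKeySet & autograd_dispatch_keyset).empty()`:
// only hand out a sequence number when an autograd key is still in the set.
int64_t sequenceNumberFor(uint64_t keySet, bool grad_enabled, int64_t next_seq) {
  const bool has_autograd = (keySet & kAutogradKeys) != 0;
  return (has_autograd && grad_enabled) ? next_seq : -1;
}

int main() {
  const uint64_t first_call = bits(Key::CPU) | bits(Key::AutogradCPU);
  const uint64_t no_autograd = bits(Key::CPU);  // no autograd key left in the set
  std::cout << sequenceNumberFor(first_call, /*grad_enabled=*/true, 42) << "\n";   // 42
  std::cout << sequenceNumberFor(no_autograd, /*grad_enabled=*/true, 42) << "\n";  // -1
  return 0;
}
```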
@@ -304,9 +304,9 @@ class TORCH_API Dispatcher final { private: Dispatcher(); - static int64_t sequenceNumberForRunningRecordFunction(DispatchKey dispatchKey); - static void runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey); - static void runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, c10::ArrayRef args); + static int64_t sequenceNumberForRunningRecordFunction(DispatchKey dispatchKey, DispatchKeySet dispatchKeySet); + static void runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, DispatchKeySet dispatchKeySet); + static void runRecordFunction(at::RecordFunction& guard, at::RecordFunction::schema_ref_t schema_ref, DispatchKey dispatchKey, DispatchKeySet dispatchKeySet, c10::ArrayRef args); #ifdef FBCODE_CAFFE2 static bool profilingOperatorEvents(); @@ -403,6 +403,10 @@ class TORCH_API OperatorHandle { return operatorDef_->op.hasKernelForDispatchKey(k); } + bool isKernelFallthroughKernel(DispatchKey k) const { + return operatorDef_->op.kernelForDispatchKey(k).isFallthrough(); + } + bool hasKernelForAnyDispatchKey(DispatchKeySet k) const { return operatorDef_->op.hasKernelForAnyDispatchKey(k); } @@ -630,15 +634,15 @@ inline Return Dispatcher::callWithDispatchKeySlowPath(const TypedOperatorHandle< TORCH_INTERNAL_ASSERT_DEBUG_ONLY(lastArgIdx == num_boxed_args); // I don't *think* we need std::launder here, because IValue has // no subclasses and no const or reference fields. - runRecordFunction(guard, schema_ref, dispatchKey, c10::ArrayRef(reinterpret_cast(boxedArgs), num_boxed_args)); + runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet, c10::ArrayRef(reinterpret_cast(boxedArgs), num_boxed_args)); for (size_t ii = 0; ii < num_boxed_args; ++ii) { reinterpret_cast(&boxedArgs[ii])->~IValue(); } } else { - runRecordFunction(guard, schema_ref, dispatchKey); + runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet); } } else { - runRecordFunction(guard, schema_ref, dispatchKey); + runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet); } if (C10_UNLIKELY(guard.needsOutputs())) { @@ -732,8 +736,8 @@ inline void Dispatcher::callBoxed(const OperatorHandle& op, Stack* stack) const auto dispatchKey = dispatchKeySet.highestPriorityTypeId(); auto& schema = op.schema(); auto schema_ref = std::reference_wrapper(schema); - guard.needsInputs() ? runRecordFunction(guard, schema_ref, dispatchKey, c10::ArrayRef(stack->data(), stack->size())) - : runRecordFunction(guard, schema_ref, dispatchKey); + guard.needsInputs() ? runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet, c10::ArrayRef(stack->data(), stack->size())) + : runRecordFunction(guard, schema_ref, dispatchKey, dispatchKeySet); // keeping the guard alive while executing the kernel kernel.callBoxed(op, dispatchKeySet, stack); diff --git a/aten/src/ATen/core/dispatch/OperatorEntry.cpp b/aten/src/ATen/core/dispatch/OperatorEntry.cpp index 627109c516daf..5f4538f2c9790 100644 --- a/aten/src/ATen/core/dispatch/OperatorEntry.cpp +++ b/aten/src/ATen/core/dispatch/OperatorEntry.cpp @@ -421,7 +421,7 @@ void OperatorEntry::updateDispatchTable_(const c10::Dispatcher& dispatcher, Disp // In theory, we should only have to check if the given runtime key has "dense" functionality, // e.g. DispatchKey::CPU (which is composed of DispatchKey::Dense and BackendComponent::CPUBit). 
// However, there are some backends that should be included in this set that don't have the dense key set. - // E.g. DispatchKey::Meta, DispatchKey::ORT. + // E.g. DispatchKey::Meta, DispatchKey::MAIA. if (c10::isBackendDispatchKey(dispatch_key)) { DispatchKey autograd_key = getAutogradKeyFromBackend(toBackendComponent(dispatch_key)); updateDispatchTableEntry_(dispatcher, autograd_key); diff --git a/aten/src/ATen/core/dynamic_type.h b/aten/src/ATen/core/dynamic_type.h index ecfb6a999bc2e..25b75b9e51114 100644 --- a/aten/src/ATen/core/dynamic_type.h +++ b/aten/src/ATen/core/dynamic_type.h @@ -187,7 +187,7 @@ class DynamicType : public SharedType { bool equals(const DynamicType& other) const; template - bool compareArguments(const DynamicType& other, F&& f) const { + bool compareArguments(const DynamicType& other, const F& f) const { if (arguments_.elems.size() != other.arguments_.elems.size()) { return false; } diff --git a/aten/src/ATen/core/enum_type.h b/aten/src/ATen/core/enum_type.h index bd60c1e77224f..136fe59e22fb5 100644 --- a/aten/src/ATen/core/enum_type.h +++ b/aten/src/ATen/core/enum_type.h @@ -88,7 +88,7 @@ struct TORCH_API EnumType : public NamedType { cu_(std::move(cu)) {} std::string annotation_str_impl( - C10_UNUSED TypePrinter printer = nullptr) const override { + C10_UNUSED const TypePrinter& printer = nullptr) const override { const auto& n = name().value(); return n.qualifiedName(); } diff --git a/aten/src/ATen/core/function.h b/aten/src/ATen/core/function.h index b4f82712a57d0..f55e15e50b4fa 100644 --- a/aten/src/ATen/core/function.h +++ b/aten/src/ATen/core/function.h @@ -14,8 +14,7 @@ namespace at { TORCH_API void launch(std::function func); } -namespace torch { -namespace jit { +namespace torch::jit { struct Graph; struct Code; @@ -29,7 +28,9 @@ using Kwargs = std::unordered_map; struct RecursiveMethodCallError : public std::exception {}; using TaskLauncher = std::function)>; -TORCH_API void preoptimizeGraph(std::shared_ptr& graph, bool disable_autocast=false); +TORCH_API void preoptimizeGraph( + std::shared_ptr& graph, + bool disable_autocast = false); // A Function is a pure Graph with no implicit `self` object bound. // It contains schema information and the executor that manages the @@ -54,14 +55,13 @@ struct TORCH_API Function { virtual c10::intrusive_ptr runAsync( Stack& /*stack*/, + // NOLINTNEXTLINE(performance-unnecessary-value-param) C10_UNUSED TaskLauncher taskLauncher = at::launch) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return {}; } - at::IValue operator()( - Stack stack, - const Kwargs& kwargs = Kwargs()) { + at::IValue operator()(Stack stack, const Kwargs& kwargs = Kwargs()) { getSchema().checkAndNormalizeInputs(stack, kwargs); run(stack); return stack.front(); @@ -93,8 +93,12 @@ struct TORCH_API Function { // If call() returns true, then callback completes successfully, otherwise // call() returns false. - // Overload for server interpreter, a bailout size is needed for graph executor. - virtual bool call(Stack&, c10::optional, c10::function_ref) { + // Overload for server interpreter, a bailout size is needed for graph + // executor. 
+ virtual bool call( + Stack&, + c10::optional, + c10::function_ref) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(false); return false; } @@ -107,5 +111,4 @@ struct TORCH_API Function { virtual ~Function() = default; }; -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/aten/src/ATen/core/function_schema.h b/aten/src/ATen/core/function_schema.h index b3818784561f4..79e7ffed1a14f 100644 --- a/aten/src/ATen/core/function_schema.h +++ b/aten/src/ATen/core/function_schema.h @@ -143,10 +143,10 @@ struct Argument { inferred_type_hint); } - Argument cloneWithType(TypePtr new_type) const { + Argument cloneWithType(const TypePtr& new_type) const { return Argument( name_, - std::move(new_type), + new_type, N_, default_value_, kwarg_only_, diff --git a/aten/src/ATen/core/functional.h b/aten/src/ATen/core/functional.h index 6b4f3447f5d48..1ddc674182010 100644 --- a/aten/src/ATen/core/functional.h +++ b/aten/src/ATen/core/functional.h @@ -9,7 +9,7 @@ namespace c10 { // const reference (const T&); taking T by non-const reference // will result in an error like: // -// error: no type named 'type' in 'class std::result_of' +// error: no type named 'type' in 'class std::invoke_result' // // No explicit template parameters are required. diff --git a/aten/src/ATen/core/interned_strings.h b/aten/src/ATen/core/interned_strings.h index 15857eec9df06..4f6abd66cb887 100644 --- a/aten/src/ATen/core/interned_strings.h +++ b/aten/src/ATen/core/interned_strings.h @@ -1,9 +1,4 @@ #pragma once -#include -#include -#include -#include -#include #include @@ -232,6 +227,7 @@ namespace c10 { _(aten, is_autocast_enabled) \ _(aten, is_autocast_cpu_enabled) \ _(aten, is_autocast_xla_enabled) \ + _(aten, get_autocast_dtype) \ FORALL_ATEN_BASE_SYMBOLS(_) \ _(onnx, Add) \ _(onnx, Concat) \ diff --git a/aten/src/ATen/core/interned_strings_class.h b/aten/src/ATen/core/interned_strings_class.h index 6e57332b99f97..a215fa62c7e91 100644 --- a/aten/src/ATen/core/interned_strings_class.h +++ b/aten/src/ATen/core/interned_strings_class.h @@ -1,5 +1,3 @@ -#include -#include #include #include #include diff --git a/aten/src/ATen/core/ivalue.cpp b/aten/src/ATen/core/ivalue.cpp index 4ce4b9c42774c..7343d66fcb97d 100644 --- a/aten/src/ATen/core/ivalue.cpp +++ b/aten/src/ATen/core/ivalue.cpp @@ -258,7 +258,6 @@ void IValue::getSubValues(HashAliasedIValues& subValues) const { case Tag::Capsule: TORCH_CHECK_TYPE( false, "Cannot inspect value of type ", this->tagKind()); - [[fallthrough]]; default: // don't record scalars. break; diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 57812446d0764..07e85677c3c75 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -10,11 +10,9 @@ #include #include #include -#include #include #include #include -#include #include #include #include @@ -492,9 +490,7 @@ struct TORCH_API IValue final { // Custom C++ classes template < typename T, - std::enable_if_t< - std::is_base_of::value, - int> = 0> + std::enable_if_t, int> = 0> IValue(intrusive_ptr custom_class); bool isCustomClass() const; template @@ -508,17 +504,17 @@ struct TORCH_API IValue final { template < typename... Args, std::enable_if_t< - !std::disjunction< + !std::disjunction_v< std::is_lvalue_reference..., - std::negation>...>::value, + std::negation>...>, std::nullptr_t> = nullptr> IValue(const std::tuple& t); template < typename... 
Args, std::enable_if_t< - !std::disjunction< + !std::disjunction_v< std::is_lvalue_reference..., - std::negation>...>::value, + std::negation>...>, std::nullptr_t> = nullptr> IValue(std::tuple&& t); bool isTuple() const { @@ -536,8 +532,13 @@ struct TORCH_API IValue final { return Tag::Double == tag; } double toDouble() const { - AT_ASSERT(isDouble()); - return payload.u.as_double; + if (isDouble()) { + return payload.u.as_double; + } else if (isSymFloat()) { + return toSymFloat().guard_float(__FILE__, __LINE__); + } else { + TORCH_INTERNAL_ASSERT(0, "expected double"); + } } // ComplexDouble @@ -643,8 +644,13 @@ struct TORCH_API IValue final { } int64_t toInt() const { - AT_ASSERT(isInt()); - return payload.u.as_int; + if (isInt()) { + return payload.u.as_int; + } else if (isSymInt()) { + return toSymInt().guard_int(__FILE__, __LINE__); + } else { + TORCH_INTERNAL_ASSERT(0, "expected int"); + } } // Bool @@ -662,8 +668,13 @@ struct TORCH_API IValue final { return Tag::Bool == tag; } bool toBool() const { - AT_ASSERT(isBool()); - return payload.u.as_bool; + if (isBool()) { + return payload.u.as_bool; + } else if (isSymBool()) { + return toSymBool().guard_bool(__FILE__, __LINE__); + } else { + TORCH_INTERNAL_ASSERT(0, "expected bool"); + } } // IntList @@ -732,7 +743,7 @@ struct TORCH_API IValue final { // This SFINAEs the called constructor exists. template using enable_if_ivalue_constructible = - std::enable_if_t::value, std::nullptr_t>; + std::enable_if_t, std::nullptr_t>; // The rule for lists is more complicated; the generic constructor is only // acceptable if your element isn't SymInt. If you do have a SymInt element, @@ -744,8 +755,7 @@ struct TORCH_API IValue final { // they're not selectable. template using enable_if_list_is_ivalue_constructible = std::enable_if_t< - std::is_constructible::value && - !std::is_same::value, + std::is_constructible_v && !std::is_same_v, std::nullptr_t>; template = nullptr> @@ -756,6 +766,8 @@ struct TORCH_API IValue final { IValue(at::ArrayRef v); template = nullptr> IValue(const std::vector& v); + template = nullptr> + IValue(std::vector&& v); template IValue(std::array v); @@ -764,7 +776,7 @@ struct TORCH_API IValue final { // to prevent implicit conversions template using enable_if_symint = - std::enable_if_t::value, std::nullptr_t>; + std::enable_if_t, std::nullptr_t>; template = nullptr> IValue(at::ArrayRef v); @@ -772,13 +784,14 @@ struct TORCH_API IValue final { IValue(at::OptionalArrayRef v); template = nullptr> IValue(const std::vector& v); + template = nullptr> + IValue(std::vector&& v); template using enable_if_ilist_is_ivalue_constructible = std::enable_if_t< - std::is_constructible::value && - std::is_constructible::boxed_type>:: - value && - !std::is_same::value, + std::is_constructible_v && + std::is_constructible_v::boxed_type> && + !std::is_same_v, std::nullptr_t>; template = nullptr> @@ -839,7 +852,7 @@ struct TORCH_API IValue final { c10::intrusive_ptr toEnumHolder() const&; // None - IValue() : tag(Tag::None) {} + IValue() = default; bool isNone() const { return Tag::None == tag; } @@ -932,21 +945,20 @@ struct TORCH_API IValue final { // ScalarType IValue(ScalarType t) - : IValue(static_cast::type>(t)) {} + : IValue(static_cast>(t)) {} at::ScalarType toScalarType() const { return static_cast(toInt()); } // Layout - IValue(Layout l) - : IValue(static_cast::type>(l)) {} + IValue(Layout l) : IValue(static_cast>(l)) {} at::Layout toLayout() const { return static_cast(toInt()); } // MemoryFormat IValue(MemoryFormat m) - : 
IValue(static_cast::type>(m)) {} + : IValue(static_cast>(m)) {} at::MemoryFormat toMemoryFormat() const { return static_cast(toInt()); } @@ -1171,6 +1183,7 @@ struct TORCH_API IValue final { } } + // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) C10_ALWAYS_INLINE void moveFrom(IValue&& rhs) noexcept { if (rhs.isTensor()) { new (&payload.as_tensor) at::Tensor(std::move(rhs.payload.as_tensor)); diff --git a/aten/src/ATen/core/ivalue_inl.h b/aten/src/ATen/core/ivalue_inl.h index d59d33219d82e..3e3525c274118 100644 --- a/aten/src/ATen/core/ivalue_inl.h +++ b/aten/src/ATen/core/ivalue_inl.h @@ -361,10 +361,10 @@ struct TORCH_API TupleElements { switch (inlineSize_) { case 3: new (&elementsInline_[2]) IValue(elements[2]); - C10_FALLTHROUGH; + [[fallthrough]]; case 2: new (&elementsInline_[1]) IValue(elements[1]); - C10_FALLTHROUGH; + [[fallthrough]]; case 1: new (&elementsInline_[0]) IValue(elements[0]); break; @@ -1034,11 +1034,9 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { */ template void addCallback(T callback, bool uses_future = true) { -#if __cpp_lib_is_invocable >= 201703 static_assert( std::is_invocable_r::value, "The callback must have signature void(Future&)"); -#endif std::unique_lock lock(mutex_); if (completed()) { @@ -1057,14 +1055,13 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { template c10::intrusive_ptr then(T callback, TypePtr type) { using IValueWithStorages = std::tuple>; -#if __cpp_lib_is_invocable >= 201703 static_assert( std::disjunction< std::is_invocable_r, std::is_invocable_r>::value, "The callback must have signature IValue(Future&) or " "std::tuple>(Future&)"); -#endif + auto childFut = createInstance(::std::move(type)); addCallback([childFut, cb = std::move(callback)](Future& parentFut) mutable { @@ -1084,11 +1081,10 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { template c10::intrusive_ptr thenAsync(T callback, TypePtr type) { -#if __cpp_lib_is_invocable >= 201703 static_assert( std::is_invocable_r, T, Future&>::value, "The callback must have signature c10::intrusive_ptr(Future&)"); -#endif + auto childFut = createInstance(std::move(type)); addCallback( [childFut, cb = std::move(callback)](Future& parentFut) mutable { @@ -1165,11 +1161,9 @@ struct C10_EXPORT ivalue::Future final : c10::intrusive_ptr_target { // synchronize them with the value, and so on (if needed). template void invokeCallback(T callback, bool uses_future) { -#if __cpp_lib_is_invocable >= 201703 static_assert( std::is_invocable_r::value, "The callback must have signature void(Future&)"); -#endif // The synchronization performed below shouldn't be needed when the future // is not used by the callback. @@ -1675,8 +1669,8 @@ struct _guarded_unsigned_long_unique_dummy final { _guarded_unsigned_long_unique_dummy(int64_t){}; }; using _guarded_unsigned_long = std::conditional_t< - std::is_same::value || - std::is_same::value, + std::is_same_v || + std::is_same_v, _guarded_unsigned_long_unique_dummy, unsigned long>; @@ -1936,9 +1930,9 @@ template < typename... Args, typename Indices = std::make_index_sequence, std::enable_if_t< - !std::disjunction< + !std::disjunction_v< std::is_lvalue_reference..., - std::negation>...>::value, + std::negation>...>, std::nullptr_t> = nullptr> std::tuple generic_to(const IValue& ivalue, _fake_type>) { const auto& vals = ivalue.toTupleRef().elements(); @@ -2116,9 +2110,9 @@ inline IValue::IValue(c10::intrusive_ptr v) template < typename... 
Args, std::enable_if_t< - !std::disjunction< + !std::disjunction_v< std::is_lvalue_reference..., - std::negation>...>::value, + std::negation>...>, std::nullptr_t>> inline IValue::IValue(const std::tuple& t) : IValue(c10::guts::apply(c10::ivalue::Tuple::create, t)) { @@ -2127,9 +2121,9 @@ inline IValue::IValue(const std::tuple& t) template < typename... Args, std::enable_if_t< - !std::disjunction< + !std::disjunction_v< std::is_lvalue_reference..., - std::negation>...>::value, + std::negation>...>, std::nullptr_t>> inline IValue::IValue(std::tuple&& t) : IValue(c10::guts::apply(c10::ivalue::Tuple::create, std::move(t))) { @@ -2185,6 +2179,23 @@ template > inline IValue::IValue(const std::vector& v) : IValue() { *this = IValue(at::ArrayRef(v)); } +template > +inline IValue::IValue(std::vector&& v) : IValue() { + auto vi = c10::asIntArrayRefSlowOpt(v); + if (vi.has_value()) { + // This list is entirely integers; ensure it is typed as + // an IntList so toIntList works + *this = IValue(*vi); + } else { + // This list has SymInts; type it as a SymInt + *this = IValue(impl::toList(c10::List())); + auto list = to>(); + list.reserve(v.size()); + for (auto&& e : std::move(v)) { + list.push_back(std::move(e)); + } + } +} template > inline IValue::IValue(const std::vector& v) : IValue(c10::List()) { auto list = to>(); @@ -2193,6 +2204,22 @@ inline IValue::IValue(const std::vector& v) : IValue(c10::List()) { list.push_back(e); } } + +template > +inline IValue::IValue(std::vector&& v) : IValue(c10::List()) { + auto list = to>(); + list.reserve(v.size()); + if constexpr (std::is_same_v) { + for (auto e : v) { + list.push_back(e); + } + } else { + for (auto&& e : std::move(v)) { + list.push_back(std::move(e)); + } + } +} + template > inline IValue::IValue(c10::OptionalArrayRef v) : IValue() { if (v.has_value()) { @@ -2280,7 +2307,7 @@ inline IValue IValue::make_capsule( template < typename T, - std::enable_if_t::value, int>> + std::enable_if_t, int>> IValue::IValue(c10::intrusive_ptr custom_class) : tag(Tag::Object) { auto classType = []() { try { @@ -2288,8 +2315,7 @@ IValue::IValue(c10::intrusive_ptr custom_class) : tag(Tag::Object) { } catch (const c10::Error&) { throw c10::Error( "Trying to instantiate a class that isn't a registered custom class: " + - std::string(c10::util::get_fully_qualified_type_name()), - ""); + std::string(c10::util::get_fully_qualified_type_name())); } }(); auto ivalue_obj = c10::ivalue::Object::create(std::move(classType), /* numSlots */1); diff --git a/aten/src/ATen/core/jit_type.h b/aten/src/ATen/core/jit_type.h index 3f0d7970a10f9..05f7242855417 100644 --- a/aten/src/ATen/core/jit_type.h +++ b/aten/src/ATen/core/jit_type.h @@ -17,14 +17,13 @@ #include #include #include -#include #include -namespace torch { -namespace jit { + +namespace torch::jit { struct Function; -} // namespace jit -} // namespace torch +} // namespace torch::jit + namespace c10 { @@ -171,9 +170,9 @@ struct TORCH_API UnionType : public SharedType { protected: explicit UnionType(std::vector types, TypeKind kind=TypeKind::UnionType); - std::string annotation_str_impl(TypePrinter printer = nullptr) const override; + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override; std::string unionStr( - TypePrinter printer = nullptr, + const TypePrinter& printer = nullptr, bool is_annotation_str = false) const; // NOLINTNEXTLINE(cppcoreguidelines-non-private-member-variables-in-classes) bool has_free_variables_; @@ -240,9 +239,9 @@ struct TORCH_API OptionalType : public UnionType { 
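Editor's note: one detail in the new rvalue `IValue(std::vector&&)` constructor above is the `if constexpr` branch (its template argument is stripped in this extract, but it is presumably the usual `std::vector<bool>` case) that copies elements instead of moving them: `vector<bool>` stores packed bits and its iterators yield proxy objects, so there is nothing to move. A standalone sketch of that pattern, independent of `IValue`:

```cpp
#include <string>
#include <type_traits>
#include <utility>
#include <vector>

// Append the contents of an rvalue vector to dst, moving where that helps.
// std::vector<bool> is special-cased: its iterators return proxies over
// packed bits, so the elements are simply copied.
template <class T>
void append_moving(std::vector<T>& dst, std::vector<T>&& src) {
  dst.reserve(dst.size() + src.size());
  if constexpr (std::is_same_v<T, bool>) {
    for (auto e : src) {
      dst.push_back(e);
    }
  } else {
    for (auto&& e : src) {
      dst.push_back(std::move(e));
    }
  }
}

int main() {
  std::vector<std::string> a{"x"}, b{"y", "z"};
  append_moving(a, std::move(b));  // moves the strings into a

  std::vector<bool> c{true}, d{false, true};
  append_moving(c, std::move(d));  // copies the bits into c
  return 0;
}
```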
TypePtr contained_; - std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "Optional[" << getElementType()->annotation_str(std::move(printer)) << "]"; + ss << "Optional[" << getElementType()->annotation_str(printer) << "]"; return ss.str(); } }; @@ -546,6 +545,7 @@ struct VaryingShape { return c10::nullopt; } std::vector sizes; + sizes.reserve(dims_.value().size()); for (auto d : *dims_) { if (!d) { return c10::nullopt; @@ -909,9 +909,9 @@ struct TORCH_API ListType private: ListType(TypePtr elem) : SingleElementType(std::move(elem)) {} - std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "List[" << getElementType()->annotation_str(std::move(printer)) << "]"; + ss << "List[" << getElementType()->annotation_str(printer) << "]"; return ss.str(); } }; @@ -1003,7 +1003,7 @@ struct TORCH_API DictType : public SharedType { types.push_back(std::move(value)); } - std::string annotation_str_impl(TypePrinter printer = nullptr) const override; + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override; std::vector types; bool has_free_variables; @@ -1044,9 +1044,9 @@ struct TORCH_API FutureType private: FutureType(TypePtr elem) : SingleElementType(std::move(elem)) {} - std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "Future[" << getElementType()->annotation_str(std::move(printer)) << "]"; + ss << "Future[" << getElementType()->annotation_str(printer) << "]"; return ss.str(); } }; @@ -1086,7 +1086,7 @@ struct TORCH_API AwaitType private: AwaitType(TypePtr elem) : SingleElementType(std::move(elem)) {} - std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; ss << "Await[" << getElementType()->annotation_str(printer) << "]"; return ss.str(); @@ -1118,9 +1118,9 @@ struct TORCH_API RRefType private: RRefType(TypePtr elem) : SingleElementType(std::move(elem)) {} - std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { std::stringstream ss; - ss << "RRef[" << getElementType()->annotation_str(std::move(printer)) << "]"; + ss << "RRef[" << getElementType()->annotation_str(printer) << "]"; return ss.str(); } }; @@ -1225,7 +1225,7 @@ struct TORCH_API TupleType : public NamedType { return true; } - std::string annotation_str_impl(TypePrinter printer = nullptr) const override; + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override; std::vector elements_; bool has_free_variables_; @@ -1278,7 +1278,7 @@ struct TORCH_API NumberType : public Type { protected: NumberType(TypeKind kind = TypeKind::NumberType) : Type(kind) {} - std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override { return "number"; // technically not a valid python type, but // we need to use it when parsing back in annotations // for implicit conversions @@ -1305,7 +1305,7 @@ 
struct TORCH_API FloatType : public NumberType { private: FloatType() : NumberType(TypeKind::FloatType) {} - std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override { return "float"; } }; @@ -1330,7 +1330,7 @@ struct TORCH_API ComplexType : public NumberType { private: ComplexType() : NumberType(TypeKind::ComplexType) {} - std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override { return "complex"; } }; @@ -1348,7 +1348,7 @@ struct TORCH_API SymIntType : public Type { std::string str() const override { return "SymInt"; } - std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { return "int"; } static const TypeKind Kind = TypeKind::SymIntType; @@ -1368,7 +1368,7 @@ struct TORCH_API SymFloatType : public Type { std::string str() const override { return "SymFloat"; } - std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { return "float"; } static const TypeKind Kind = TypeKind::SymFloatType; @@ -1388,7 +1388,7 @@ struct TORCH_API SymBoolType : public Type { std::string str() const override { return "SymBool"; } - std::string annotation_str_impl(TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(const TypePrinter& printer = nullptr) const override { return "bool"; } static const TypeKind Kind = TypeKind::SymBoolType; @@ -1419,7 +1419,7 @@ struct TORCH_API IntType : public NumberType { private: IntType() : NumberType(TypeKind::IntType) {} - std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override { return "int"; } }; @@ -1453,7 +1453,7 @@ struct TORCH_API StringType : public Type { // we only use "str" (not "string") in both FunctionSchema and script return annotation_str(); } - std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override { return "str"; } static const TypeKind Kind = TypeKind::StringType; @@ -1473,7 +1473,7 @@ struct TORCH_API StorageType : public Type { std::string str() const override { return annotation_str(); } - std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override { return "Storage"; } static const TypeKind Kind = TypeKind::StorageType; @@ -1508,7 +1508,7 @@ struct TORCH_API FunctionType : public NamedType { private: FunctionType(torch::jit::Function* function); - std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(C10_UNUSED const TypePrinter& printer = nullptr) const override { const auto& n = name().value(); return n.qualifiedName(); } @@ -2199,7 +2199,7 @@ struct TORCH_API InterfaceType : public NamedType { const InterfaceType& rhs, std::ostream* why_not); - std::string annotation_str_impl(C10_UNUSED TypePrinter printer = nullptr) const override { + std::string annotation_str_impl(C10_UNUSED 
const TypePrinter& printer = nullptr) const override { return name()->qualifiedName(); } diff --git a/aten/src/ATen/core/jit_type_base.h b/aten/src/ATen/core/jit_type_base.h index bf4909aaefec7..21692db56dd87 100644 --- a/aten/src/ATen/core/jit_type_base.h +++ b/aten/src/ATen/core/jit_type_base.h @@ -118,7 +118,7 @@ struct CastReturnType { }; template -struct CastReturnType::value>::type> { +struct CastReturnType::value>> { using type = SingletonTypePtr; }; @@ -128,7 +128,7 @@ struct CastConstReturnType { }; template -struct CastConstReturnType::value>::type> { +struct CastConstReturnType::value>> { using type = SingletonTypePtr; }; @@ -156,7 +156,7 @@ struct TORCH_API Type { Type(Type&&) noexcept = default; Type& operator=(Type&&) noexcept = default; - virtual std::string annotation_str_impl(TypePrinter /*printer*/) const { + virtual std::string annotation_str_impl(const TypePrinter& /*printer*/) const { return str(); } // a == b @@ -177,7 +177,7 @@ struct TORCH_API Type { /* implicit */ SingletonOrSharedTypePtr(std::shared_ptr x) : repr_(std::move(x)) {} - template ::value, bool> = true> + template , bool> = true> /* implicit */ SingletonOrSharedTypePtr(std::shared_ptr x) : repr_(std::move(x)) {} @@ -187,7 +187,7 @@ struct TORCH_API Type { /* implicit */ SingletonOrSharedTypePtr(SingletonTypePtr p) : repr_(p) {} - template ::value, bool> = true> + template , bool> = true> /* implicit */ SingletonOrSharedTypePtr(SingletonTypePtr p) : repr_(SingletonTypePtr(p.get())) {} @@ -205,10 +205,10 @@ struct TORCH_API Type { // Case 3: Otherwise, T is not a SharedType. (debug-check this // assumption!) Use a singleton pointer. - template ::value, bool> = true> + template , bool> = true> /* implicit */ SingletonOrSharedTypePtr(T* p) : SingletonOrSharedTypePtr(static_cast::type>(p)->shared_from_this()) {} - template ::value, bool> = true> + template , bool> = true> /* implicit */ SingletonOrSharedTypePtr(T* p) { if (auto* shared_p = dynamic_cast::type>(p)) { repr_ = Repr(shared_p->shared_from_this()); @@ -217,7 +217,7 @@ struct TORCH_API Type { } } - template ::value && !std::is_base_of::value, bool> = true> + template && !std::is_base_of_v, bool> = true> /* implicit */ SingletonOrSharedTypePtr(T* p) : repr_(p) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(dynamic_cast::type>(p) == nullptr); @@ -244,7 +244,7 @@ struct TORCH_API Type { return repr_.isNonNull(); } - template , void>::value, bool> = true> + template , void>, bool> = true> U& operator*() const { return *get(); } @@ -409,37 +409,37 @@ struct TORCH_API Type { // Compatibility shims to accommodate existing code that passes shared_ptrs // around. Ideally, we would just delete this, but it should be harmless. 
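Editor's note: the `TypePrinter` now passed by const reference through these `annotation_str_impl` overrides is a customization hook: a caller-supplied callable that may return `nullopt` to fall back to the type's default rendering, as the `annotation_str` implementation further down in jit_type_base.h shows. The same pattern, reduced to plain standard C++ with a toy `Thing` in place of `c10::Type`:

```cpp
#include <functional>
#include <iostream>
#include <optional>
#include <string>

struct Thing {
  std::string name;
  std::string default_render() const { return "<" + name + ">"; }
};

// Caller-provided hook; returning nullopt means "use the default".
using Printer = std::function<std::optional<std::string>(const Thing&)>;

std::string render(const Thing& t, const Printer& printer = nullptr) {
  if (printer) {
    if (auto renamed = printer(t)) {
      return *renamed;  // the hook handled this value
    }
  }
  return t.default_render();  // fall through to the default rendering
}

int main() {
  Thing t{"tensor"};
  std::cout << render(t) << "\n";  // prints <tensor>
  std::cout << render(t, [](const Thing& x) -> std::optional<std::string> {
    return x.name == "tensor" ? std::optional<std::string>("Tensor") : std::nullopt;
  }) << "\n";                      // prints Tensor
  return 0;
}
```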
template - typename std::enable_if::value, bool>::type + std::enable_if_t, bool> isSubtypeOf(const std::shared_ptr& rhs) const { return isSubtypeOf(*rhs); } template - typename std::enable_if::value, bool>::type + std::enable_if_t, bool> isSubtypeOf(const SingletonOrSharedTypePtr& rhs) const { return isSubtypeOf(*rhs); } template - typename std::enable_if::value, bool>::type + std::enable_if_t, bool> isSubtypeOf(SingletonTypePtr rhs) const { return isSubtypeOf(*rhs); } template - typename std::enable_if::value, bool>::type + std::enable_if_t, bool> isSubtypeOfExt(const SingletonOrSharedTypePtr& rhs, std::ostream* why_not) const { return isSubtypeOfExt(*rhs, why_not); } template - typename std::enable_if::value, bool>::type + std::enable_if_t, bool> isSubtypeOfExt(const std::shared_ptr& rhs, std::ostream* why_not) const { return isSubtypeOfExt(*rhs, why_not); } template - typename std::enable_if::value, bool>::type + std::enable_if_t, bool> isSubtypeOfExt(SingletonTypePtr rhs, std::ostream* why_not) const { return isSubtypeOfExt(*rhs, why_not); } @@ -453,14 +453,14 @@ struct TORCH_API Type { // // Takes a custom printer that users can pass in to customize the output of // this method. - std::string annotation_str(TypePrinter printer) const { + std::string annotation_str(const TypePrinter& printer) const { if (printer) { // the printer can return nullopt to fall through to the default impl if (auto renamed = printer(*this)) { return *renamed; } } - return annotation_str_impl(std::move(printer)); + return annotation_str_impl(printer); } std::string annotation_str() const { // Overload instead of define a default value for `printer` to help @@ -583,6 +583,7 @@ struct TORCH_API Type { // per-type constructor, you only need to override this if the // containedTypes() is not empty virtual TypePtr createWithContained( + // NOLINTNEXTLINE(performance-unnecessary-value-param) std::vector /*contained_types*/) const { AT_ERROR( "type with contained types did not overload createWithContained: ", diff --git a/aten/src/ATen/core/library.cpp b/aten/src/ATen/core/library.cpp index 1341ee0c8b8d3..fd349da2f8b0c 100644 --- a/aten/src/ATen/core/library.cpp +++ b/aten/src/ATen/core/library.cpp @@ -51,6 +51,10 @@ CppFunction::CppFunction(c10::KernelFunction func, c10::optional k, const char* file, uint32_t line) @@ -129,12 +133,12 @@ Library& Library::_def(c10::FunctionSchema&& schema, c10::OperatorName* out_name } switch (rv) { case _RegisterOrVerify::REGISTER: - if (impl_abstract_pystub_.has_value()) { + if (python_module_.has_value()) { registrars_.emplace_back( - c10::Dispatcher::singleton().registerAbstractImplPyStub( + c10::Dispatcher::singleton().registerPythonModule( schema.operator_name(), - impl_abstract_pystub_->first, - impl_abstract_pystub_->second) + python_module_->first, + python_module_->second) ); } registrars_.emplace_back( @@ -153,6 +157,7 @@ Library& Library::_def(c10::FunctionSchema&& schema, c10::OperatorName* out_name } #undef DEF_PRELUDE +// NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) Library& Library::_def(std::variant&& name_or_schema, CppFunction&& f, const std::vector& tags) & { c10::FunctionSchema schema = [&] { if (std::holds_alternative(name_or_schema)){ @@ -214,6 +219,7 @@ at::OperatorName Library::_parseNameForLib(const char* name_str) const { return name; } +// NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) Library& Library::_impl(const char* name_str, CppFunction&& f, _RegisterOrVerify rv) & { at::OperatorName name = 
_parseNameForLib(name_str); // See Note [Redundancy in registration code is OK] @@ -253,6 +259,7 @@ c10::OperatorName Library::_resolve(const char* name_str) const { } #undef IMPL_PRELUDE +// NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) Library& Library::_fallback(CppFunction&& f) & { TORCH_CHECK(kind_ == IMPL, "fallback(...): Cannot define an operator inside of a ", toString(kind_), " block. " @@ -275,8 +282,8 @@ Library& Library::_fallback(CppFunction&& f) & { registrars_.emplace_back( c10::Dispatcher::singleton().registerFallback( k, - std::move(f.func_), - debugString(std::move(f.debug_), file_, line_) + f.func_, + debugString(f.debug_, file_, line_) ) ); } diff --git a/aten/src/ATen/core/op_registration/README.md b/aten/src/ATen/core/op_registration/README.md index 5605e962a6e5e..61b41b48c4a67 100644 --- a/aten/src/ATen/core/op_registration/README.md +++ b/aten/src/ATen/core/op_registration/README.md @@ -13,13 +13,13 @@ There’s four main use cases * You’re writing a new operator that isn’t supposed to be part of the public PyTorch API. * You’re writing a new operator but don’t want to change the core pytorch code base, say you’re developing a shared library with operators. * You’re writing a C++ extension for PyTorch or you’re using inline c++ in your .py model files. -* You’re writing a backend library like XLA or ORT that adds new kernels to all operators defined in `native_functions.yaml`. +* You’re writing a backend library like XLA or MAIA that adds new kernels to all operators defined in `native_functions.yaml`. For these use cases, the custom operator API is the better solution. ### What is the price for using the custom operator API instead of `native_functions.yaml`? -If you’re just using the custom operator API to add new kernels for existing operators (e.g. the XLA/ORT example above), then you’re fine and don’t pay any price. If, however, you define a new operator purely using the custom op API, i.e. your operator never shows up in `native_functions.yaml`, then you need to be aware of a few caveats. +If you’re just using the custom operator API to add new kernels for existing operators (e.g. the XLA/MAIA example above), then you’re fine and don’t pay any price. If, however, you define a new operator purely using the custom op API, i.e. your operator never shows up in `native_functions.yaml`, then you need to be aware of a few caveats. * It will not get a C++ API generated. There will not be `Tensor::your_op()` methods or `at::your_op()` functions to call your operator. * The API for calling the operator from Python looks a little bit different. It needs to be called through `torch.ops.your_op()` instead of `torch._C`. diff --git a/aten/src/ATen/core/op_registration/infer_schema.h b/aten/src/ATen/core/op_registration/infer_schema.h index a00ef76f460b9..57409442950f2 100644 --- a/aten/src/ATen/core/op_registration/infer_schema.h +++ b/aten/src/ATen/core/op_registration/infer_schema.h @@ -6,7 +6,6 @@ */ #include -#include #include namespace c10 { @@ -37,10 +36,10 @@ template constexpr int checkStaticTypes() { // Give nice error messages for some of the common error cases. // Use a LOUD ERROR MESSAGE SO USERS SEE THE STATIC_ASSERT - static_assert(guts::conjunction< + static_assert(std::conjunction< bool_t::value || std::is_same::value || std::is_same::value || std::is_same::value>...
>::value, "INVALID TYPE: Only int8_t, int64_t and bool are supported as an integral argument type"); - static_assert(guts::conjunction< + static_assert(std::conjunction< bool_t::value>... >::value, "INVALID TYPE: float is not supported as an argument type, use double instead"); return 0; diff --git a/aten/src/ATen/core/op_registration/op_registration_test.cpp b/aten/src/ATen/core/op_registration/op_registration_test.cpp index a1c9c63052f1d..377cb403cdcfd 100644 --- a/aten/src/ATen/core/op_registration/op_registration_test.cpp +++ b/aten/src/ATen/core/op_registration/op_registration_test.cpp @@ -1154,15 +1154,15 @@ TEST(OperatorRegistrationTest, testAvailableArgTypes) { "(int[]? a) -> int[]?"); // Test list of optional (with empty list) - testArgTypes>>::test( - c10::List>(c10::List>({})), [] (const c10::List>& v) {EXPECT_EQ(0, v.size());}, - c10::List>(c10::List>({})), [] (const IValue& v) {EXPECT_EQ(0, v.to>>().size());}, + testArgTypes>>::test( + c10::List<::std::optional>(c10::List<::std::optional>({})), [] (const c10::List<::std::optional>& v) {EXPECT_EQ(0, v.size());}, + c10::List<::std::optional>(c10::List<::std::optional>({})), [] (const IValue& v) {EXPECT_EQ(0, v.to>>().size());}, "(int?[] a) -> int?[]"); // Test list of optional (with values) - testArgTypes>>::test( - c10::List>(c10::List>({3, c10::nullopt, 2})), [] (const c10::List>& v) {expectListEquals>({3, c10::nullopt, 2}, v);}, - c10::List>(c10::List>({3, c10::nullopt, 2})), [] (const IValue& v) {expectListEquals>({3, c10::nullopt, 2}, v.to>>());}, + testArgTypes>>::test( + c10::List<::std::optional>(c10::List<::std::optional>({3, c10::nullopt, 2})), [] (const c10::List<::std::optional>& v) {expectListEquals>({3, c10::nullopt, 2}, v);}, + c10::List<::std::optional>(c10::List<::std::optional>({3, c10::nullopt, 2})), [] (const IValue& v) {expectListEquals>({3, c10::nullopt, 2}, v.to>>());}, "(int?[] a) -> int?[]"); // dict types @@ -1234,15 +1234,15 @@ TEST(OperatorRegistrationTest, testAvailableArgTypes) { "(Dict(int, Tensor) a) -> Dict(int, Tensor)"); // weird deeply nested type - using DeeplyNestedType = c10::List>>>>; + using DeeplyNestedType = c10::List>>>>; auto makeDeeplyNestedObject = [] () -> DeeplyNestedType { c10::Dict inner3; inner3.insert(1, "1"); - c10::List>> inner2; + c10::List<::std::optional>> inner2; inner2.push_back(std::move(inner3)); - c10::Dict>>> inner1; + c10::Dict>>> inner1; inner1.insert("key", std::move(inner2)); - c10::List>>>> result; + c10::List>>>> result; result.push_back(inner1); return result; }; diff --git a/aten/src/ATen/core/rref_interface.h b/aten/src/ATen/core/rref_interface.h index cefb29c08ddc6..f0749d368792f 100644 --- a/aten/src/ATen/core/rref_interface.h +++ b/aten/src/ATen/core/rref_interface.h @@ -1,7 +1,7 @@ #pragma once #include -#include +#include namespace c10 { diff --git a/aten/src/ATen/core/stack.h b/aten/src/ATen/core/stack.h index 1695e5995ab69..5dc89da6c5627 100644 --- a/aten/src/ATen/core/stack.h +++ b/aten/src/ATen/core/stack.h @@ -8,8 +8,8 @@ // TODO move this to c10 namespace -namespace torch { -namespace jit { + +namespace torch::jit { using c10::IValue; using Stack = std::vector; @@ -22,13 +22,14 @@ class Operation { template ::value, int> = 0> C10_DEPRECATED_MESSAGE("Please use void(Stack&) to register operator instead.") + // NOLINTNEXTLINE(cppcoreguidelines-missing-std-forward) Operation(F&& raw): op_([raw = std::forward(raw)](Stack& stack) { raw(&stack); }) {} template ::value && - !std::is_same, Operation>::value, int> = 0> + !std::is_same_v, Operation>, int> 
= 0> Operation(F&& op): op_(std::forward(op)) {} Operation(std::nullptr_t) noexcept {} @@ -66,12 +67,14 @@ class Operation { // treat the last N elements of the stack as a list, looking up // element i static inline IValue& peek(Stack& stack, size_t i, size_t N) { + // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions) return *(stack.end() - N + i); } static inline IValue& peek(Stack* stack, size_t i, size_t N) { return peek(*stack, i, N); } static inline const IValue& peek(const Stack& stack, size_t i, size_t N) { + // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions) return *(stack.end() - N + i); } static inline const IValue& peek(const Stack* stack, size_t i, size_t N) { @@ -93,6 +96,7 @@ static inline at::ArrayRef last(const Stack* stack, size_t N) { return last(*stack, N); } static inline void drop(Stack& stack, size_t n) { + // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions) stack.erase(stack.end() - n, stack.end()); } static inline void drop(Stack* stack, size_t n) { @@ -188,6 +192,7 @@ struct TuplePacker { template struct TuplePacker<0, Args...> { + // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) static void execute(Stack& /*stack*/, std::tuple&& /*t*/){}; }; @@ -196,5 +201,4 @@ inline void pack(Stack& stack, std::tuple&& t) { TuplePacker::execute(stack, std::move(t)); } -} // namespace jit -} // namespace torch +} // namespace torch::jit diff --git a/aten/src/ATen/core/tensor_type.cpp b/aten/src/ATen/core/tensor_type.cpp index efe2d4cb18703..c7f8c8b05f91e 100644 --- a/aten/src/ATen/core/tensor_type.cpp +++ b/aten/src/ATen/core/tensor_type.cpp @@ -223,9 +223,9 @@ VaryingShape TensorType::computeStrideProps( has_overlap = possible_cross_dimension_overlap(sizes, strides); } } - std::vector stride_properties; - + std::vector stride_properties; + stride_properties.reserve(stride_indices.size()); for (size_t i = 0; i < stride_indices.size(); i++) { bool contiguous_ = tensor_contiguity; if (!contiguous_) { @@ -338,6 +338,7 @@ template struct VaryingShape; template struct VaryingShape; template struct VaryingShape; template struct VaryingShape; +template struct VaryingShape; VaryingShape TensorType::sizes() const { if (!sizes_.rank()) { diff --git a/aten/src/ATen/core/type.cpp b/aten/src/ATen/core/type.cpp index 53c08ae26d5df..f7d67ca84861a 100644 --- a/aten/src/ATen/core/type.cpp +++ b/aten/src/ATen/core/type.cpp @@ -41,7 +41,7 @@ static_assert( sizeof(SingletonOrSharedTypePtr) == sizeof(std::shared_ptr) && sizeof(std::shared_ptr) == 2 * sizeof(void*), "std::shared_ptr has an unexpected representation on this platform!"); static_assert( - std::is_same>()), const TupleTypePtr&>::value, + std::is_same_v>()), const TupleTypePtr&>, "getTypePtr> not returning const ref!"); TypeVerbosity type_verbosity() { @@ -314,9 +314,9 @@ TypePtr DictType::get(const std::string& identifier, TypePtr key, TypePtr value) return containerTypePtrs[map_key]; } -std::string DictType::annotation_str_impl(TypePrinter printer) const { +std::string DictType::annotation_str_impl(const TypePrinter& printer) const { auto keyAnnotation = getKeyType()->annotation_str(printer); - auto valueAnnotation = getValueType()->annotation_str(std::move(printer)); + auto valueAnnotation = getValueType()->annotation_str(printer); std::string result; result.reserve(5 /* "Dict[" */ + keyAnnotation.size() + 2 /* ", " */ + valueAnnotation.size() + 1 /* "]" */); @@ -500,7 +500,7 @@ MatchTypeReturn matchTypeVariables( if (it == type_env.end()) { type_env[vt->name()] = actual; return 
MatchTypeReturn::Success(); - } else if (auto unified = unifyTypes(it->second, actual)) { + } else if (unifyTypes(it->second, actual)) { // note: unifyTypes allows subtyping in either direction, so actual // may be a supertype of the current binding. we're not responsible // for reporting the error, only for keeping type_env stable @@ -916,7 +916,7 @@ std::string TupleType::str() const { } return ss.str(); } -std::string TupleType::annotation_str_impl(TypePrinter printer) const { +std::string TupleType::annotation_str_impl(const TypePrinter& printer) const { if (schema_ && name()) { return name()->qualifiedName(); } diff --git a/aten/src/ATen/core/type_ptr.h b/aten/src/ATen/core/type_ptr.h index d14c3b8a45641..0859e04c7d2d8 100644 --- a/aten/src/ATen/core/type_ptr.h +++ b/aten/src/ATen/core/type_ptr.h @@ -20,7 +20,7 @@ class SingletonTypePtr { using element_type = typename std::shared_ptr::element_type; - template , void>::value, bool> = true> + template , void>, bool> = true> T& operator*() const { return *repr_; } diff --git a/aten/src/ATen/core/union_type.cpp b/aten/src/ATen/core/union_type.cpp index a35c59584095b..2acc4c497ba56 100644 --- a/aten/src/ATen/core/union_type.cpp +++ b/aten/src/ATen/core/union_type.cpp @@ -359,7 +359,7 @@ bool UnionType::isSubtypeOfExt(const Type& rhs, std::ostream* why_not) const { }); } -std::string UnionType::unionStr(TypePrinter printer, bool is_annotation_str) +std::string UnionType::unionStr(const TypePrinter& printer, bool is_annotation_str) const { std::stringstream ss; @@ -399,7 +399,7 @@ std::string UnionType::unionStr(TypePrinter printer, bool is_annotation_str) ss << ", "; } if (is_annotation_str) { - ss << NumberType::get()->annotation_str(std::move(printer)); + ss << NumberType::get()->annotation_str(printer); } else { ss << NumberType::get()->str(); } @@ -412,8 +412,8 @@ std::string UnionType::str() const { return this->unionStr(nullptr, /*is_annotation_str=*/false); } -std::string UnionType::annotation_str_impl(TypePrinter printer) const { - return this->unionStr(std::move(printer), /*is_annotation_str=*/true); +std::string UnionType::annotation_str_impl(const TypePrinter& printer) const { + return this->unionStr(printer, /*is_annotation_str=*/true); } bool UnionType::canHoldType(const Type& type) const { diff --git a/aten/src/ATen/cpp_custom_type_hack.h b/aten/src/ATen/cpp_custom_type_hack.h index 75b900c0d694d..1367ef94df738 100644 --- a/aten/src/ATen/cpp_custom_type_hack.h +++ b/aten/src/ATen/cpp_custom_type_hack.h @@ -57,8 +57,7 @@ #include #endif -namespace at { -namespace cpp_custom_type_hack { +namespace at::cpp_custom_type_hack { template [[deprecated( @@ -108,5 +107,4 @@ create(std::unique_ptr ptr, TensorOptions options) { return retval; } -} // namespace cpp_custom_type_hack -} // namespace at +} // namespace at::cpp_custom_type_hack diff --git a/aten/src/ATen/cpu/vec/functional_base.h b/aten/src/ATen/cpu/vec/functional_base.h index 3b183ad965279..48d44dc42c33c 100644 --- a/aten/src/ATen/cpu/vec/functional_base.h +++ b/aten/src/ATen/cpu/vec/functional_base.h @@ -78,6 +78,35 @@ struct VecReduceAllSIMD { #endif // defined(CPU_CAPABILITY_AVX512) #endif // defined(__GNUC__) && (__GNUC__ > 5) && !defined(_MSC_VER) && !defined(C10_MOBILE) +#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) +template +struct VecReduceAllSIMD { + static inline float apply(const Op& vec_fun, const Vectorized& acc_vec) { + using Vec = Vectorized; + Vec v = acc_vec; + + // 128-bit shuffle: [a1, a2, a3, a4, a5, a6, a7, a8] -> [a5, a6, a7, 
a8, a1, a2, a3, a4] + Vec v1 = {v.get_high(), v.get_low()}; + // [a1+a5, a2+a6, a3+a7, a4+a8, -, -, -, -] ('+' stands for the reduction function. Note that the last 4 elements are not required) + v = vec_fun(v, v1); + + // 64-bit shuffle: [a1+a5, a2+a6, a3+a7, a4+a8, -, -, -, -] -> [a3+a7, a4+a8, a1+a5, a2+a6, -, -, -, -] + float32x4_t v1_1 = vextq_f32(v.get_low(), v.get_low(), 2); + v1 = {v1_1, v1_1}; + // [a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, -, -, -, -] + v = vec_fun(v, v1); + + // 32-bit shuffle: [a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, -, -, -, -] -> [a2+a4+a6+a8, a1+a3+a5+a7, a2+a4+a6+a8, a1+a3+a5+a7, -, -, -, -] + v1_1 = vrev64q_f32(v.get_low()); + v1 = {v1_1, v1_1}; + // [a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, a1+a2+a3+a4+a5+a6+a7+a8, -, -, -, -] + v = vec_fun(v, v1); + + return v.get_low()[0]; + } +}; +#endif // defined(__aarch64__) + template inline scalar_t vec_reduce_all(const Op& vec_fun, const Vectorized& acc_vec) { return VecReduceAllSIMD::apply(vec_fun, acc_vec); diff --git a/aten/src/ATen/cpu/vec/functional_bfloat16.h b/aten/src/ATen/cpu/vec/functional_bfloat16.h index 03cb017549ce9..3bd22b3820f0b 100644 --- a/aten/src/ATen/cpu/vec/functional_bfloat16.h +++ b/aten/src/ATen/cpu/vec/functional_bfloat16.h @@ -45,6 +45,34 @@ inline Vectorized convert_from_float(const Vectorized& a, con return convert_float_half(a, b); } +template , int> = 0> +inline void load_to_float(const scalar_t *data, Vectorized &out1, Vectorized &out2); + +template <> +inline void load_to_float (const BFloat16 *data, Vectorized &out1, Vectorized &out2) { + load_fp32_from_bf16(data, out1, out2); +} + +template <> +inline void load_to_float (const Half *data, Vectorized &out1, Vectorized &out2) { + load_fp32_from_fp16(data, out1, out2); +} + +template , int> = 0> +inline void load_to_float(const scalar_t *data, Vectorized &out); + +template <> +inline void load_to_float (const BFloat16 *data, Vectorized &out) { + load_fp32_from_bf16(data, out); +} + +template <> +inline void load_to_float (const Half *data, Vectorized &out) { + load_fp32_from_fp16(data, out); +} + // Note that we already have specialized member of Vectorized for BFloat16 // so the following functions would run smoothly: // using Vec = Vectorized; @@ -74,8 +102,7 @@ inline float reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) { using fVec = vec::Vectorized; if (size < bVec::size()) { bVec data_bvec = bVec::loadu(data, size); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size > fVec::size()) { data_fvec0 = fVec::set(data_fvec0, vec_fun(data_fvec0, data_fvec1), size - fVec::size()); return vec_reduce_all(vec_fun, data_fvec0, fVec::size()); @@ -85,19 +112,16 @@ inline float reduce_all(const Op& vec_fun, const scalar_t* data, int64_t size) { } int64_t d = bVec::size(); bVec acc_bvec = bVec::loadu(data); - fVec acc_fvec0, acc_fvec1; - std::tie(acc_fvec0, acc_fvec1) = convert_to_float(acc_bvec); + auto [acc_fvec0, acc_fvec1] = convert_to_float(acc_bvec); for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(data + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); acc_fvec0 = vec_fun(acc_fvec0, data_fvec0); acc_fvec1 = vec_fun(acc_fvec1, data_fvec1); } if (size - d > 0) { bVec data_bvec = 
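Aside: the aarch64 specialization above reduces an 8-lane Vectorized<float> with three shuffle/op rounds. The same ladder on a single float32x4_t, as a standalone sketch (reduce4 is a hypothetical helper; the op is assumed associative and commutative, as the reductions here are):

#include <arm_neon.h>

template <typename Op>
float reduce4(const Op& op, float32x4_t v) {
  // [a,b,c,d] op [c,d,a,b] -> pairwise partials in every lane
  v = op(v, vextq_f32(v, v, 2));
  // swap within 64-bit pairs and combine -> full reduction in lane 0
  v = op(v, vrev64q_f32(v));
  return vgetq_lane_f32(v, 0);
}

Called as reduce4([](float32x4_t a, float32x4_t b) { return vaddq_f32(a, b); }, v) this computes a horizontal sum.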
bVec::loadu(data + d, size - d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size - d > fVec::size()) { acc_fvec0 = vec_fun(acc_fvec0, data_fvec0); acc_fvec1 = fVec::set(acc_fvec1, vec_fun(acc_fvec1, data_fvec1), size - d - fVec::size()); @@ -117,8 +141,7 @@ inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& vec_f using fVec = vec::Vectorized; if (size < bVec::size()) { bVec data_bvec = bVec::loadu(data, size); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size > fVec::size()) { fVec acc1_fvec = fVec::set(data_fvec0, vec_fun1(data_fvec0, data_fvec1), size - fVec::size()); fVec acc2_fvec = fVec::set(data_fvec0, vec_fun2(data_fvec0, data_fvec1), size - fVec::size()); @@ -133,14 +156,11 @@ inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& vec_f } int64_t d = bVec::size(); bVec acc_bvec = bVec::loadu(data); - fVec acc1_fvec0, acc1_fvec1; - std::tie(acc1_fvec0, acc1_fvec1) = convert_to_float(acc_bvec); - fVec acc2_fvec0, acc2_fvec1; - std::tie(acc2_fvec0, acc2_fvec1) = convert_to_float(acc_bvec); + auto [acc1_fvec0, acc1_fvec1] = convert_to_float(acc_bvec); + auto [acc2_fvec0, acc2_fvec1] = convert_to_float(acc_bvec); for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(data + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); acc1_fvec0 = vec_fun1(acc1_fvec0, data_fvec0); acc1_fvec1 = vec_fun1(acc1_fvec1, data_fvec1); acc2_fvec0 = vec_fun2(acc2_fvec0, data_fvec0); @@ -148,8 +168,7 @@ inline std::pair reduce2_all(const Op1& vec_fun1, const Op2& vec_f } if (size - d > 0) { bVec data_bvec = bVec::loadu(data + d, size - d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size - d > fVec::size()) { acc1_fvec0 = vec_fun1(acc1_fvec0, data_fvec0); acc1_fvec1 = fVec::set(acc1_fvec1, vec_fun1(acc1_fvec1, data_fvec1), size - d - fVec::size()); @@ -178,8 +197,7 @@ inline float map_reduce_all( using fVec = vec::Vectorized; if (size < bVec::size()) { bVec data_bvec = bVec::loadu(data, size); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size > fVec::size()) { data_fvec0 = map_fun(data_fvec0); data_fvec1 = map_fun(data_fvec1); @@ -192,14 +210,12 @@ inline float map_reduce_all( } int64_t d = bVec::size(); bVec acc_bvec = bVec::loadu(data); - fVec acc_fvec0, acc_fvec1; - std::tie(acc_fvec0, acc_fvec1) = convert_to_float(acc_bvec); + auto [acc_fvec0, acc_fvec1] = convert_to_float(acc_bvec); acc_fvec0 = map_fun(acc_fvec0); acc_fvec1 = map_fun(acc_fvec1); for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(data + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); data_fvec0 = map_fun(data_fvec0); data_fvec1 = map_fun(data_fvec1); acc_fvec0 = red_fun(acc_fvec0, data_fvec0); @@ -207,8 +223,7 @@ inline float map_reduce_all( } if (size - d > 0) { bVec data_bvec = bVec::loadu(data + d, size - d); - fVec data_fvec0, data_fvec1; - 
std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); if (size - d > fVec::size()) { data_fvec0 = map_fun(data_fvec0); data_fvec1 = map_fun(data_fvec1); @@ -235,11 +250,9 @@ inline float map2_reduce_all( using fVec = vec::Vectorized; if (size < bVec::size()) { bVec data_bvec = bVec::loadu(data, size); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); bVec data2_bvec = bVec::loadu(data2, size); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); if (size > fVec::size()) { data_fvec0 = map_fun(data_fvec0, data2_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1); @@ -252,20 +265,16 @@ inline float map2_reduce_all( } int64_t d = bVec::size(); bVec acc_bvec = bVec::loadu(data); - fVec acc_fvec0, acc_fvec1; - std::tie(acc_fvec0, acc_fvec1) = convert_to_float(acc_bvec); + auto [acc_fvec0, acc_fvec1] = convert_to_float(acc_bvec); bVec acc2_bvec = bVec::loadu(data2); - fVec acc2_fvec0, acc2_fvec1; - std::tie(acc2_fvec0, acc2_fvec1) = convert_to_float(acc2_bvec); + auto [acc2_fvec0, acc2_fvec1] = convert_to_float(acc2_bvec); acc_fvec0 = map_fun(acc_fvec0, acc2_fvec0); acc_fvec1 = map_fun(acc_fvec1, acc2_fvec1); for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(data + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); bVec data2_bvec = bVec::loadu(data2 + d); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); data_fvec0 = map_fun(data_fvec0, data2_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1); acc_fvec0 = red_fun(acc_fvec0, data_fvec0); @@ -273,11 +282,9 @@ inline float map2_reduce_all( } if (size - d > 0) { bVec data_bvec = bVec::loadu(data + d, size - d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); bVec data2_bvec = bVec::loadu(data2 + d, size - d); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); if (size - d > fVec::size()) { data_fvec0 = map_fun(data_fvec0, data2_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1); @@ -305,14 +312,11 @@ inline float map3_reduce_all( using fVec = vec::Vectorized; if (size < bVec::size()) { bVec data_bvec = bVec::loadu(data, size); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); bVec data2_bvec = bVec::loadu(data2, size); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); bVec data3_bvec = bVec::loadu(data3, size); - fVec data3_fvec0, data3_fvec1; - std::tie(data3_fvec0, data3_fvec1) = convert_to_float(data3_bvec); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); if (size > fVec::size()) { data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1, data3_fvec1); @@ -325,26 
+329,20 @@ inline float map3_reduce_all( } int64_t d = bVec::size(); bVec acc_bvec = bVec::loadu(data); - fVec acc_fvec0, acc_fvec1; - std::tie(acc_fvec0, acc_fvec1) = convert_to_float(acc_bvec); + auto [acc_fvec0, acc_fvec1] = convert_to_float(acc_bvec); bVec acc2_bvec = bVec::loadu(data2); - fVec acc2_fvec0, acc2_fvec1; - std::tie(acc2_fvec0, acc2_fvec1) = convert_to_float(acc2_bvec); + auto [acc2_fvec0, acc2_fvec1] = convert_to_float(acc2_bvec); bVec acc3_bvec = bVec::loadu(data3); - fVec acc3_fvec0, acc3_fvec1; - std::tie(acc3_fvec0, acc3_fvec1) = convert_to_float(acc3_bvec); + auto [acc3_fvec0, acc3_fvec1] = convert_to_float(acc3_bvec); acc_fvec0 = map_fun(acc_fvec0, acc2_fvec0, acc3_fvec0); acc_fvec1 = map_fun(acc_fvec1, acc2_fvec1, acc3_fvec1); for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(data + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); bVec data2_bvec = bVec::loadu(data2 + d); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); bVec data3_bvec = bVec::loadu(data3 + d); - fVec data3_fvec0, data3_fvec1; - std::tie(data3_fvec0, data3_fvec1) = convert_to_float(data3_bvec); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1, data3_fvec1); acc_fvec0 = red_fun(acc_fvec0, data_fvec0); @@ -352,14 +350,11 @@ inline float map3_reduce_all( } if (size - d > 0) { bVec data_bvec = bVec::loadu(data + d, size - d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); bVec data2_bvec = bVec::loadu(data2 + d, size - d); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); bVec data3_bvec = bVec::loadu(data3 + d, size - d); - fVec data3_fvec0, data3_fvec1; - std::tie(data3_fvec0, data3_fvec1) = convert_to_float(data3_bvec); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); if (size - d > fVec::size()) { data_fvec0 = map_fun(data_fvec0, data2_fvec0, data3_fvec0); data_fvec1 = map_fun(data_fvec1, data2_fvec1, data3_fvec1); @@ -386,8 +381,7 @@ inline void map( int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(input_data + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); fVec output_fvec0 = vec_fun(data_fvec0); fVec output_fvec1 = vec_fun(data_fvec1); bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); @@ -395,8 +389,7 @@ inline void map( } if (size - d > 0) { bVec data_bvec = bVec::loadu(input_data + d, size - d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); fVec output_fvec0 = vec_fun(data_fvec0); fVec output_fvec1 = vec_fun(data_fvec1); bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); @@ -452,11 +445,9 @@ inline void map2( int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(input_data + d); - fVec 
data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); bVec data2_bvec = bVec::loadu(input_data2 + d); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); fVec output_fvec0 = vec_fun(data_fvec0, data2_fvec0); fVec output_fvec1 = vec_fun(data_fvec1, data2_fvec1); bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); @@ -464,11 +455,9 @@ inline void map2( } if (size - d > 0) { bVec data_bvec = bVec::loadu(input_data + d, size - d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); bVec data2_bvec = bVec::loadu(input_data2 + d, size - d); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); fVec output_fvec0 = vec_fun(data_fvec0, data2_fvec0); fVec output_fvec1 = vec_fun(data_fvec1, data2_fvec1); bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); @@ -490,14 +479,11 @@ inline void map3( int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec data1_bvec = bVec::loadu(input_data1 + d); - fVec data1_fvec0, data1_fvec1; - std::tie(data1_fvec0, data1_fvec1) = convert_to_float(data1_bvec); + auto [data1_fvec0, data1_fvec1] = convert_to_float(data1_bvec); bVec data2_bvec = bVec::loadu(input_data2 + d); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); bVec data3_bvec = bVec::loadu(input_data3 + d); - fVec data3_fvec0, data3_fvec1; - std::tie(data3_fvec0, data3_fvec1) = convert_to_float(data3_bvec); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0); fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1); bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); @@ -505,14 +491,11 @@ inline void map3( } if (size - d > 0) { bVec data1_bvec = bVec::loadu(input_data1 + d, size - d); - fVec data1_fvec0, data1_fvec1; - std::tie(data1_fvec0, data1_fvec1) = convert_to_float(data1_bvec); + auto [data1_fvec0, data1_fvec1] = convert_to_float(data1_bvec); bVec data2_bvec = bVec::loadu(input_data2 + d, size - d); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); bVec data3_bvec = bVec::loadu(input_data3 + d, size - d); - fVec data3_fvec0, data3_fvec1; - std::tie(data3_fvec0, data3_fvec1) = convert_to_float(data3_bvec); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0); fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1); bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); @@ -535,17 +518,13 @@ inline void map4( int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec data1_bvec = bVec::loadu(input_data1 + d); - fVec data1_fvec0, data1_fvec1; - std::tie(data1_fvec0, data1_fvec1) = convert_to_float(data1_bvec); + auto [data1_fvec0, data1_fvec1] = convert_to_float(data1_bvec); bVec data2_bvec = bVec::loadu(input_data2 + d); - fVec data2_fvec0, data2_fvec1; - 
std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); bVec data3_bvec = bVec::loadu(input_data3 + d); - fVec data3_fvec0, data3_fvec1; - std::tie(data3_fvec0, data3_fvec1) = convert_to_float(data3_bvec); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); bVec data4_bvec = bVec::loadu(input_data4 + d); - fVec data4_fvec0, data4_fvec1; - std::tie(data4_fvec0, data4_fvec1) = convert_to_float(data4_bvec); + auto [data4_fvec0, data4_fvec1] = convert_to_float(data4_bvec); fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0, data4_fvec0); fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1, data4_fvec1); bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); @@ -553,17 +532,13 @@ inline void map4( } if (size - d > 0) { bVec data1_bvec = bVec::loadu(input_data1 + d, size - d); - fVec data1_fvec0, data1_fvec1; - std::tie(data1_fvec0, data1_fvec1) = convert_to_float(data1_bvec); + auto [data1_fvec0, data1_fvec1] = convert_to_float(data1_bvec); bVec data2_bvec = bVec::loadu(input_data2 + d, size - d); - fVec data2_fvec0, data2_fvec1; - std::tie(data2_fvec0, data2_fvec1) = convert_to_float(data2_bvec); + auto [data2_fvec0, data2_fvec1] = convert_to_float(data2_bvec); bVec data3_bvec = bVec::loadu(input_data3 + d, size - d); - fVec data3_fvec0, data3_fvec1; - std::tie(data3_fvec0, data3_fvec1) = convert_to_float(data3_bvec); + auto [data3_fvec0, data3_fvec1] = convert_to_float(data3_bvec); bVec data4_bvec = bVec::loadu(input_data4 + d, size - d); - fVec data4_fvec0, data4_fvec1; - std::tie(data4_fvec0, data4_fvec1) = convert_to_float(data4_bvec); + auto [data4_fvec0, data4_fvec1] = convert_to_float(data4_bvec); fVec output_fvec0 = vec_fun(data1_fvec0, data2_fvec0, data3_fvec0, data4_fvec0); fVec output_fvec1 = vec_fun(data1_fvec1, data2_fvec1, data3_fvec1, data4_fvec1); bVec output_bvec = convert_from_float(output_fvec0, output_fvec1); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256.h b/aten/src/ATen/cpu/vec/vec256/vec256.h index 8127ddd4a9a4f..84c3e8b6e5ce4 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256.h @@ -9,6 +9,7 @@ #if !(defined(__VSX__) || defined(CPU_CAPABILITY_VSX) || defined(CPU_CAPABILITY_ZVECTOR)) #include #include +#include #include #include #include @@ -22,6 +23,9 @@ #include #endif +#include +#include + #include #include #include @@ -69,7 +73,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { } -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX2) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -94,7 +98,8 @@ inline Vectorized cast(const Vectorized& src) } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - +#ifndef _MSC_VER +// MSVC is not working well on complex function overload. template std::enable_if_t> inline gather(const double* base_addr, const Vectorized& vindex) { @@ -106,9 +111,10 @@ std::enable_if_t& vindex) { return _mm256_i32gather_ps(base_addr, vindex, scale); } - +#endif // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - +#ifndef _MSC_VER +// MSVC is not working well on complex function overload. 
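Aside: most of the functional_bfloat16.h churn above is the same mechanical change, repeated per hunk: two declared fVec temporaries plus std::tie become one C++17 structured binding. A toy standalone version of the pattern (split and sum_halves are stand-ins, not convert_to_float):

#include <tuple>

// Stand-in for convert_to_float(): return two "halves" of a packed value.
std::tuple<float, float> split(int packed) {
  return std::make_tuple(static_cast<float>(packed & 0xffff),
                         static_cast<float>(packed >> 16));
}

float sum_halves(int packed) {
  // Before: float lo, hi; std::tie(lo, hi) = split(packed);
  auto [lo, hi] = split(packed);   // after: one declaration, no std::tie
  return lo + hi;
}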
template std::enable_if_t> inline mask_gather(const Vectorized& src, const double* base_addr, @@ -122,7 +128,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, const Vectorized& vindex, Vectorized& mask) { return _mm256_mask_i32gather_ps(src, base_addr, vindex, mask, scale); } - +#endif // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Only works for inputs in the range: [-2^51, 2^51] @@ -143,6 +149,24 @@ inline convert_to_int_of_same_size(const Vectorized &src) { return _mm256_cvttps_epi32(src); } +// Only works for inputs in the range: [-2^51, 2^51] +// From: https://stackoverflow.com/a/41148578 +template<> +Vectorized +inline convert_to_fp_of_same_size(const Vectorized &src) { + auto x = _mm256_add_epi64(src, _mm256_castpd_si256(_mm256_set1_pd(0x0018000000000000))); + return _mm256_sub_pd( + _mm256_castsi256_pd(x), + _mm256_set1_pd(0x0018000000000000) + ); +} + +template<> +Vectorized +inline convert_to_fp_of_same_size(const Vectorized &src) { + return _mm256_cvtepi32_ps(src); +} + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template <> @@ -284,6 +308,6 @@ inline Vectorized flip(const Vectorized & v) { return flip8(v); } -#endif // (defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#endif // (defined(CPU_CAPABILITY_AVX2) }} // namepsace at::vec::CPU_CAPABILITY diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h index 5e302be45acce..19e0320d8abf6 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_bfloat16.h @@ -7,7 +7,8 @@ #include #include -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) +#define SLEEF_STATIC_LIBS #include #endif @@ -18,7 +19,18 @@ namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) + +#ifndef SLEEF_CONST +#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) +#define SLEEF_CONST const +#else +#define SLEEF_CONST +#endif +#define SLEEF_CONST_OLD SLEEF_CONST +#else +#define SLEEF_CONST_OLD +#endif // bfloat16 conversion static inline void cvtbf16_fp32(const __m128i& a, __m256& o) { @@ -31,6 +43,28 @@ static inline void cvtbf16_fp32(const __m256i& a, __m256& o1, __m256& o2) { cvtbf16_fp32(lo, o1); cvtbf16_fp32(hi, o2); } + +static inline __m128i cvtfp32_bf16(const __m256& src) { + __m256i value = _mm256_castps_si256(src); + __m256i nan = _mm256_set1_epi32(0xffff); + __m256i mask = _mm256_castps_si256(_mm256_cmp_ps(src, src, _CMP_ORD_Q)); + __m256i ones = _mm256_set1_epi32(0x1); + __m256i vec_bias = _mm256_set1_epi32(0x7fff); + // uint32_t lsb = (input >> 16) & 1; + auto t_value = _mm256_and_si256(_mm256_srli_epi32(value, 16), ones); + // uint32_t rounding_bias = 0x7fff + lsb; + t_value = _mm256_add_epi32(t_value, vec_bias); + // input += rounding_bias; + t_value = _mm256_add_epi32(t_value, value); + // input = input >> 16; + t_value = _mm256_srli_epi32(t_value, 16); + // Check NaN before converting back to bf16 + t_value = _mm256_blendv_epi8(nan, t_value, mask); + t_value = _mm256_packus_epi32(t_value, t_value); // t[4-7] t[4-7] t[0-4] t[0-4] + t_value = _mm256_permute4x64_epi64(t_value, 0xd8); // 11 01 10 00 + return _mm256_castsi256_si128(t_value); +} + static inline __m256i cvtfp32_bf16(const __m256& a, const __m256& b) { __m256i lo = _mm256_castps_si256(a); __m256i hi = 
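Aside: the new single-register cvtfp32_bf16 above uses the round-to-nearest-even bias trick spelled out in its comments. The same computation on one scalar, as a reference sketch (fp32_to_bf16_rne is a hypothetical name):

#include <cstdint>
#include <cstring>

uint16_t fp32_to_bf16_rne(float f) {
  if (f != f) {
    return 0xffff;                        // NaN, matching the blend above
  }
  uint32_t input;
  std::memcpy(&input, &f, sizeof input);
  uint32_t lsb = (input >> 16) & 1;       // lowest surviving mantissa bit
  uint32_t rounding_bias = 0x7fff + lsb;  // ties round to even
  input += rounding_bias;
  return static_cast<uint16_t>(input >> 16);
}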
_mm256_castps_si256(b); @@ -80,6 +114,11 @@ static inline void cvtfp16_fp32(const __m256i& a, __m256& o1, __m256& o2) { cvtfp16_fp32(hi, o2); } +static inline __m128i cvtfp32_fp16(const __m256& src) { + return _mm256_cvtps_ph( + src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +} + static inline __m256i cvtfp32_fp16(const __m256& a, const __m256& b) { __m128i lo = _mm256_cvtps_ph( a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); @@ -265,7 +304,8 @@ static_assert( } return b; } - Vectorized map(const __m256 (*const vop)(__m256)) const { + + Vectorized map(SLEEF_CONST __m256 (*SLEEF_CONST_OLD vop)(__m256)) const { __m256 lo, hi; cvt_to_fp32(values, lo, hi); const auto o1 = vop(lo); @@ -285,14 +325,14 @@ static_assert( Vectorized angle() const { __m256 lo, hi; cvt_to_fp32(values, lo, hi); - auto angle_lambda = [](__m256 values) { + auto angle_lambda = [](__m256 values_2) { const auto zero_vec = _mm256_set1_ps(0.f); const auto nan_vec = _mm256_set1_ps(NAN); - const auto not_nan_mask = _mm256_cmp_ps(values, values, _CMP_EQ_OQ); + const auto not_nan_mask = _mm256_cmp_ps(values_2, values_2, _CMP_EQ_OQ); const auto nan_mask = _mm256_cmp_ps(not_nan_mask, zero_vec, _CMP_EQ_OQ); const auto pi = _mm256_set1_ps(c10::pi); - const auto neg_mask = _mm256_cmp_ps(values, zero_vec, _CMP_LT_OQ); + const auto neg_mask = _mm256_cmp_ps(values_2, zero_vec, _CMP_LT_OQ); auto angle = _mm256_blendv_ps(zero_vec, pi, neg_mask); angle = _mm256_blendv_ps(angle, nan_vec, nan_mask); return angle; @@ -313,6 +353,9 @@ static_assert( Vectorized acos() const { return map(Sleef_acosf8_u10); } + Vectorized acosh() const { + return map(Sleef_acoshf8_u10); + } Vectorized asin() const { return map(Sleef_asinf8_u10); } @@ -1023,7 +1066,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); CONVERT_VECTORIZED_INIT(Half, half); -#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#else // defined(CPU_CAPABILITY_AVX2) #define CONVERT_NON_VECTORIZED_INIT(type, name) \ inline std::tuple, Vectorized> convert_##name##_float(const Vectorized& a) { \ @@ -1046,11 +1089,49 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V return Vectorized::loadu(arr2); \ } CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); +#if defined(__aarch64__) && !defined(C10_MOBILE) && !defined(__CUDACC__) +inline std::tuple, Vectorized> convert_half_float(const Vectorized& a) { + static_assert(Vectorized::size() == 2 * Vectorized::size()); +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + float16x8x2_t arr = a; + float16x8_t x = arr.val[0]; + float16x8_t y = arr.val[1]; +#else + auto arr = reinterpret_cast(a.operator const Half*()); + float16x8_t x = vld1q_f16(arr); + float16x8_t y = vld1q_f16(arr + Vectorized::size()); +#endif + float32x4_t x1 = vcvt_f32_f16(vget_low_f16(x)); + float32x4_t x2 = vcvt_f32_f16(vget_high_f16(x)); + float32x4_t y1 = vcvt_f32_f16(vget_low_f16(y)); + float32x4_t y2 = vcvt_f32_f16(vget_high_f16(y)); + return { Vectorized(x1, x2), Vectorized(y1, y2) }; +} +inline Vectorized convert_float_half(const Vectorized& a, const Vectorized& b) { + static_assert(Vectorized::size() == 2 * Vectorized::size()); + float32x4x2_t x = a; + float32x4x2_t y = b; + float16x4_t x1 = vcvt_f16_f32(x.val[0]); + float16x4_t x2 = vcvt_f16_f32(x.val[1]); + float16x4_t y1 = vcvt_f16_f32(y.val[0]); + float16x4_t y2 = vcvt_f16_f32(y.val[1]); +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + return Vectorized(vcombine_f16(x1, x2), vcombine_f16(y1, y2)); +#else + 
Vectorized rc; + auto arr = reinterpret_cast(rc.operator Half*()); + vst1q_f16(arr, vcombine_f16(x1, x2)); + vst1q_f16(arr + Vectorized::size(), vcombine_f16(y1, y2)); + return rc; +#endif +} +#else CONVERT_NON_VECTORIZED_INIT(Half, half); +#endif -#endif // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#endif // defined(CPU_CAPABILITY_AVX2) -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) #define LOAD_FP32_VECTORIZED_INIT(type, name) \ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ auto values = _mm_loadu_si128(reinterpret_cast(data)); \ @@ -1069,7 +1150,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); LOAD_FP32_VECTORIZED_INIT(Half, fp16); -#else // defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#else // defined(CPU_CAPABILITY_AVX2) #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ __at_align__ float values[Vectorized::size()]; \ diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h index f93ea1e63c38d..6c198fb37d3d1 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_double.h @@ -8,7 +8,8 @@ #include #include -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) +#define SLEEF_STATIC_LIBS #include #endif @@ -16,7 +17,7 @@ namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) template <> class Vectorized> { private: @@ -145,7 +146,7 @@ template <> class Vectorized> { auto abs = abs_(); auto zero = _mm256_setzero_pd(); auto mask = _mm256_cmp_pd(abs, zero, _CMP_EQ_OQ); - auto div = values / abs; + auto div = _mm256_div_pd(values, abs); return _mm256_blendv_pd(div, zero, mask); } __m256d real_() const { diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h index 7c142c04b79c0..c72d4d49274a0 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_complex_float.h @@ -7,7 +7,8 @@ #include #include #include -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) +#define SLEEF_STATIC_LIBS #include #endif @@ -15,7 +16,7 @@ namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) template <> class Vectorized> { private: @@ -180,7 +181,7 @@ template <> class Vectorized> { auto abs = abs_(); auto zero = _mm256_setzero_ps(); auto mask = _mm256_cmp_ps(abs, zero, _CMP_EQ_OQ); - auto div = values / abs; + auto div = _mm256_div_ps(values, abs); return _mm256_blendv_ps(div, zero, mask); } __m256 real_() const { diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_convert.h b/aten/src/ATen/cpu/vec/vec256/vec256_convert.h new file mode 100644 index 0000000000000..55f26c606d8bd --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec256/vec256_convert.h @@ -0,0 +1,215 @@ +#pragma once + +#include +#include +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + 
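Aside: the change above in the complex headers replaces the expression `values / abs`, which relies on GCC/Clang vector-extension operators, with the explicit divide intrinsic; that form also builds under MSVC, consistent with the _MSC_VER guards being relaxed throughout these headers. A one-line sketch of the pattern (normalize is a hypothetical name):

#include <immintrin.h>

// Element-wise divide without the vector-extension operator/.
__m256d normalize(__m256d v, __m256d magnitude) {
  return _mm256_div_pd(v, magnitude);
}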
VectorizedN result; + __m256 value; + cvtbf16_fp32(_mm256_castsi256_si128(src[0]), value); + result[0] = value; + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN result; + __m256 value; + cvtfp16_fp32(_mm256_castsi256_si128(src[0]), value); + result[0] = value; + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN result; + result[0] = _mm256_castsi128_si256(cvtfp32_bf16(src[0])); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN result; + result[0] = _mm256_castsi128_si256(cvtfp32_fp16(src[0])); + return result; + } +}; + +template <> +inline Vectorized convert_to_fp_of_same_size( + const Vectorized& src); + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto low_double = at::vec::convert_to_fp_of_same_size(src[0]); + auto low = _mm256_cvtpd_ps(low_double); + auto high_double = at::vec::convert_to_fp_of_same_size(src[1]); + auto high = _mm256_cvtpd_ps(high_double); + return Vectorized( + _mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1)); + } +}; + +template <> +inline Vectorized convert_to_int_of_same_size( + const Vectorized& src); + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + at::vec::VectorizedN result; + auto int32_vec = at::vec::convert_to_int_of_same_size(src[0]); + result[0] = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(int32_vec)); + result[1] = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(int32_vec, 1)); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto low = _mm256_shuffle_epi32(src[0], _MM_SHUFFLE(2, 0, 2, 0)); + auto high = _mm256_shuffle_epi32(src[1], _MM_SHUFFLE(2, 0, 2, 0)); + auto low_perm = _mm256_permute4x64_epi64(low, _MM_SHUFFLE(3, 1, 2, 0)); + auto high_perm = _mm256_permute4x64_epi64(high, _MM_SHUFFLE(3, 1, 2, 0)); + return Vectorized(_mm256_blend_epi32(low_perm, high_perm, 0xF0)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + at::vec::VectorizedN result; + result[0] = _mm256_cvtepi32_epi64(_mm256_castsi256_si128(src[0])); + result[1] = _mm256_cvtepi32_epi64(_mm256_extracti128_si256(src[0], 1)); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto src128 = _mm256_castsi256_si128(src[0]); + return Vectorized(_mm256_cvtepi8_epi32(src128)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto src128 = _mm256_castsi256_si128(src[0]); + return Vectorized(_mm256_cvtepu8_epi32(src128)); + } +}; + +template +struct VecConvert< + dst_t, + 1, + src_t, + 1, + typename std::enable_if_t< + (is_reduced_floating_point_v && is_8bit_integer_v) || + (is_reduced_floating_point_v && is_8bit_integer_v), + void>> { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN tmp_fp32 = VecConvert::apply(src); + return VecConvert::apply(tmp_fp32); + } +}; + +template +struct VecConvert< + dst_t, + 1, + float, + 1, + typename std::enable_if_t, + void>> { + static inline VectorizedN apply(const VectorizedN& src) { + return convert_float_to_int8(src[0]); + } +}; + +template +struct VecConvert< + float, + 1, + src_t, + 1, + typename 
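Aside: the reduced-float <-> int8 specializations above do not convert directly; they compose the two existing conversions through fp32. A scalar analogue of that routing (bf16_to_float and bf16_to_int8 are toy names; the cast below truncates, so it only illustrates the composition, not the exact rounding/clamping the vector path uses):

#include <cstdint>
#include <cstring>

float bf16_to_float(uint16_t b) {
  uint32_t bits = static_cast<uint32_t>(b) << 16;  // bf16 is the top half of fp32
  float f;
  std::memcpy(&f, &bits, sizeof f);
  return f;
}

int8_t bf16_to_int8(uint16_t b) {
  return static_cast<int8_t>(bf16_to_float(b));    // route through fp32
}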
std::enable_if_t, + void>> { + static inline VectorizedN apply(const VectorizedN& src) { + return convert_int8_to_float(src[0]); + } +}; + +template +struct VecConvert< + dst_t, + 1, + int64_t, + 2, + typename std::enable_if< + std::is_same_v || + std::is_same_v>::type> { + static inline VectorizedN apply( + const VectorizedN& src) { + return VecConvert::apply( + VecConvert::apply(src)); + } +}; + +#endif + +template +struct VecConvert< + float, + 1, + src_t, + 1, + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + auto [res_vec1, res_vec2] = convert_to_float(src[0]); + return res_vec1; + } +}; + +template +struct VecConvert< + dst_t, + 1, + float, + 1, + typename std::enable_if_t, void>> { + static inline VectorizedN apply(const VectorizedN& src) { + return convert_from_float(src[0], src[0]); + } +}; + +} // namespace CPU_CAPABILITY +} // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_double.h b/aten/src/ATen/cpu/vec/vec256/vec256_double.h index 612f1ac6d21ba..bed6da627af2d 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_double.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_double.h @@ -6,7 +6,8 @@ #include #include #include -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) +#define SLEEF_STATIC_LIBS #include #endif @@ -15,7 +16,7 @@ namespace at::vec { inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) template <> class Vectorized { private: @@ -100,6 +101,10 @@ template <> class Vectorized { Vectorized isnan() const { return _mm256_cmp_pd(values, _mm256_set1_pd(0.0), _CMP_UNORD_Q); } + bool has_inf_nan() const { + __m256d self_sub = _mm256_sub_pd(values, values); + return (_mm256_movemask_epi8(_mm256_castpd_si256(self_sub)) & 0x77777777) != 0; + } Vectorized map(double (*const f)(double)) const { __at_align__ double tmp[size()]; store(tmp); @@ -136,6 +141,9 @@ template <> class Vectorized { Vectorized acos() const { return Vectorized(Sleef_acosd4_u10(values)); } + Vectorized acosh() const { + return Vectorized(Sleef_acoshd4_u10(values)); + } Vectorized asin() const { return Vectorized(Sleef_asind4_u10(values)); } diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float.h b/aten/src/ATen/cpu/vec/vec256/vec256_float.h index 2ec41d7593da8..0e3664cd37b6a 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_float.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_float.h @@ -6,7 +6,8 @@ #include #include #include -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) +#define SLEEF_STATIC_LIBS #include #endif @@ -14,7 +15,7 @@ namespace at::vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) template <> class Vectorized { private: @@ -106,6 +107,12 @@ template <> class Vectorized { Vectorized isnan() const { return _mm256_cmp_ps(values, _mm256_set1_ps(0.0f), _CMP_UNORD_Q); } + + bool has_inf_nan() const { + __m256 self_sub = _mm256_sub_ps(values, values); + return (_mm256_movemask_epi8(_mm256_castps_si256(self_sub)) & 0x77777777) != 0; + } + Vectorized map(float (*const f)(float)) const { __at_align__ float tmp[size()]; store(tmp); @@ -142,6 +149,9 @@ template <> class Vectorized { Vectorized acos() const { return Vectorized(Sleef_acosf8_u10(values)); } + Vectorized acosh() const { + return Vectorized(Sleef_acoshf8_u10(values)); + } Vectorized asin() const { return 
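Aside: the new has_inf_nan() members above lean on the identity that x - x is zero for every finite x and NaN for +/-inf and NaN, so one subtraction plus a NaN test covers both cases at once. A scalar reference of the idea (has_inf_nan_scalar is a hypothetical name; it assumes IEEE semantics, i.e. no -ffast-math):

bool has_inf_nan_scalar(float x) {
  float d = x - x;   // 0.0f for finite x, NaN for +/-inf and NaN
  return d != d;     // true only when d is NaN
}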
Vectorized(Sleef_asinf8_u10(values)); } @@ -217,14 +227,14 @@ template <> class Vectorized { static __m256 vec_factorial_5 = _mm256_set1_ps(0.00828929059f); // 1/factorial(5) static __m256 vec_exp_log2ef = - (__m256)_mm256_set1_epi32(0x3fb8aa3b); // log2(e) + _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b)); // log2(e) static __m256 vec_half = _mm256_set1_ps(0.5f); static __m256 vec_one = _mm256_set1_ps(1.f); static __m256 vec_zero = _mm256_set1_ps(0.f); static __m256 vec_two = _mm256_set1_ps(2.f); - static __m256 vec_ln2f = (__m256)_mm256_set1_epi32(0x3f317218); // ln(2) - static __m256 vec_ln_flt_min = (__m256)_mm256_set1_epi32(0xc2aeac50); - static __m256 vec_ln_flt_max = (__m256)_mm256_set1_epi32(0x42b17218); + static __m256 vec_ln2f = _mm256_castsi256_ps(_mm256_set1_epi32(0x3f317218)); // ln(2) + static __m256 vec_ln_flt_min = _mm256_castsi256_ps(_mm256_set1_epi32(0xc2aeac50)); + static __m256 vec_ln_flt_max = _mm256_castsi256_ps(_mm256_set1_epi32(0x42b17218)); static __m256i vec_127 = _mm256_set1_epi32(0x0000007f); static int n_mantissa_bits = 23; @@ -257,7 +267,7 @@ template <> class Vectorized { auto vec_exp_number_i = _mm256_cvtps_epi32(vec_exp_number); auto vec_two_pow_n_i = _mm256_add_epi32(vec_exp_number_i, vec_127); vec_two_pow_n_i = _mm256_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); - auto vec_two_pow_n = (__m256)vec_two_pow_n_i; + auto vec_two_pow_n = _mm256_castsi256_ps(vec_two_pow_n_i); vec_two_pow_n = _mm256_blendv_ps(vec_two_pow_n, vec_zero, less_ln_flt_min_mask); diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h b/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h index bf16d7236e50a..a5b993f2b9e10 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_float_neon.h @@ -307,6 +307,16 @@ template <> class Vectorized { } return loadu(res); }; + bool has_inf_nan() const { + __at_align__ float tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + if(_isnan(tmp[i]) || _isinf(tmp[i])) { + return true; + } + } + return false; + } Vectorized map(float (*const f)(float)) const { __at_align__ float tmp[size()]; store(tmp); @@ -339,6 +349,12 @@ template <> class Vectorized { map(std::acos) ); } + Vectorized acosh() const { + return USE_SLEEF( + Vectorized(Sleef_acoshf4_u10(values.val[0]), Sleef_acoshf4_u10(values.val[1])), + map(std::acosh) + ); + } Vectorized asin() const { return USE_SLEEF( Vectorized(Sleef_asinf4_u10(values.val[0]), Sleef_asinf4_u10(values.val[1])), diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_half_neon.h b/aten/src/ATen/cpu/vec/vec256/vec256_half_neon.h new file mode 100644 index 0000000000000..aaf1d5995fc05 --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec256/vec256_half_neon.h @@ -0,0 +1,819 @@ +#pragma once + +// DO NOT DEFINE STATIC DATA IN THIS HEADER! +// See Note [Do not compile initializers with AVX] + +#include +#include +#include +#include +#include + +namespace at::vec { +// See Note [CPU_CAPABILITY namespace] +inline namespace CPU_CAPABILITY { + +// Right now contains only aarch64 implementation. +// Due to follow two reasons aarch32 is not currently supported. +// 1. Due to difference in ISA been aarch32 and aarch64, intrinsics +// that work for aarch64 dont work for aarch32. +// 2. Android NDK r21 has problems with compiling aarch32. +// Clang seg faults. +// https://github.com/android/ndk/issues/1248 +// https://bugs.llvm.org/show_bug.cgi?id=45824 +// Most likely we will do aarch32 support with inline asm. 
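Aside: the exp_u20 constants above switch from C-style (__m256) casts to _mm256_castsi256_ps, which expresses the same bit-pattern reinterpretation and is also accepted by MSVC. A one-liner sketch of the pattern (log2e_vec is a hypothetical name):

#include <immintrin.h>

// Build a float constant from its IEEE-754 bit pattern (0x3fb8aa3b ~ log2(e)).
__m256 log2e_vec() {
  return _mm256_castsi256_ps(_mm256_set1_epi32(0x3fb8aa3b));
}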
+#if !defined(C10_MOBILE) && defined(__aarch64__) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) + +#ifdef __BIG_ENDIAN__ +#error "Big endian is not supported." +#endif + +template +struct BlendHalfRegs { + static float16x8_t impl( + const float16x8_t& a, + const float16x8_t& b, + float16x8_t& res); +}; + +template +struct BlendHalfRegs { + static float16x8_t impl( + const float16x8_t& a, + const float16x8_t& b, + float16x8_t& res) { + return vsetq_lane_f16(vgetq_lane_f16(b, index), res, index); + } +}; + +template +struct BlendHalfRegs { + static float16x8_t impl( + const float16x8_t& a, + const float16x8_t& b, + float16x8_t& res) { + return vsetq_lane_f16(vgetq_lane_f16(a, index), res, index); + } +}; + +// On ARM, Half type supports float16_t->Half constructor and Half->float16_t +// conversion +template <> +class Vectorized { + private: + float16x8x2_t values; + + public: + // value_type should be c10::Half to fit interface with vec_base.h + using value_type = c10::Half; + using size_type = int; + static constexpr size_type size() { + static_assert(sizeof(float16x8x2_t) == 16 * sizeof(value_type)); + return 16; + } + + private: + // We use these private map functions to implement various methods + Vectorized map2( + const Vectorized& second, + c10::Half (*const f)(c10::Half, c10::Half)) const { + __at_align__ c10::Half tmp_first[size()]; + __at_align__ c10::Half tmp_second[size()]; + store(tmp_first); // store this to tmp_first + second.store(tmp_second); + for (const auto i : c10::irange(size())) { + tmp_first[i] = f(tmp_first[i], tmp_second[i]); + } + return loadu(tmp_first); + } + + Vectorized map_with_vec_float_method( + Vectorized (Vectorized::*m)() const) const { + // Convert low float16x8_t to 2 float32x4_t variables, apply m, and convert + // back + float32x4_t v00 = vcvt_f32_f16(vget_low_f16(values.val[0])); + float32x4_t v01 = vcvt_f32_f16(vget_high_f16(values.val[0])); + Vectorized mv0 = (Vectorized(v00, v01).*m)(); + float16x4_t r00 = vcvt_f16_f32(mv0.get_low()); + float16x4_t r01 = vcvt_f16_f32(mv0.get_high()); + + // Convert high float16x8_t to 2 float32x4_t variables, apply m, and convert + // back + float32x4_t v10 = vcvt_f32_f16(vget_low_f16(values.val[1])); + float32x4_t v11 = vcvt_f32_f16(vget_high_f16(values.val[1])); + Vectorized mv1 = (Vectorized(v10, v11).*m)(); + float16x4_t r10 = vcvt_f16_f32(mv1.get_low()); + float16x4_t r11 = vcvt_f16_f32(mv1.get_high()); + + // Pack result into Vectorized + return Vectorized( + vcombine_f16(r00, r01), vcombine_f16(r10, r11)); + } + + Vectorized map2_with_vec_float_method( + const Vectorized& second, + Vectorized (Vectorized::*m)(const Vectorized&) + const) const { + // Convert low float16x8_t to 2 float32x4_t variables, apply m, and convert + // back + float32x4_t v00 = vcvt_f32_f16(vget_low_f16(values.val[0])); + float32x4_t v01 = vcvt_f32_f16(vget_high_f16(values.val[0])); + float32x4_t second_v00 = vcvt_f32_f16(vget_low_f16(second.get_low())); + float32x4_t second_v01 = vcvt_f32_f16(vget_high_f16(second.get_low())); + Vectorized mv0 = (Vectorized(v00, v01).*m)( + Vectorized(second_v00, second_v01)); + float16x4_t r00 = vcvt_f16_f32(mv0.get_low()); + float16x4_t r01 = vcvt_f16_f32(mv0.get_high()); + + // Convert high float16x8_t to 2 float32x4_t variables, apply m, and convert + // back + float32x4_t v10 = vcvt_f32_f16(vget_low_f16(values.val[1])); + float32x4_t v11 = vcvt_f32_f16(vget_high_f16(values.val[1])); + float32x4_t second_v10 = vcvt_f32_f16(vget_low_f16(second.get_high())); + float32x4_t second_v11 = 
vcvt_f32_f16(vget_high_f16(second.get_high())); + Vectorized mv1 = (Vectorized(v10, v11).*m)( + Vectorized(second_v10, second_v11)); + float16x4_t r10 = vcvt_f16_f32(mv1.get_low()); + float16x4_t r11 = vcvt_f16_f32(mv1.get_high()); + + // Pack result into Vectorized + return Vectorized( + vcombine_f16(r00, r01), vcombine_f16(r10, r11)); + } + + public: + // constructor + Vectorized() {} + Vectorized(float16x8x2_t v) : values(v) {} + + // A ctor that accepts c10::Half is needed to fit interface with vec_base.h + // A second constructor that takes float16_t is also included + Vectorized(c10::Half val) + : values{vdupq_n_f16((float16_t)val), vdupq_n_f16((float16_t)val)} { + } + Vectorized(float16_t val) : values{vdupq_n_f16(val), vdupq_n_f16(val)} {} + Vectorized( + float16_t val0, + float16_t val1, + float16_t val2, + float16_t val3, + float16_t val4, + float16_t val5, + float16_t val6, + float16_t val7, + float16_t val8, + float16_t val9, + float16_t val10, + float16_t val11, + float16_t val12, + float16_t val13, + float16_t val14, + float16_t val15) + : values{ + val0, + val1, + val2, + val3, + val4, + val5, + val6, + val7, + val8, + val9, + val10, + val11, + val12, + val13, + val14, + val15} {} + Vectorized(float16x8_t val0, float16x8_t val1) : values{val0, val1} {} + operator float16x8x2_t() const { + return values; + } + template + static Vectorized blend( + const Vectorized& a, + const Vectorized& b) { + Vectorized vec; + // 0. + vec.values.val[0] = BlendHalfRegs<0, (mask & 0x01) != 0>::impl( + a.values.val[0], b.values.val[0], vec.values.val[0]); + vec.values.val[0] = BlendHalfRegs<1, (mask & 0x02) != 0>::impl( + a.values.val[0], b.values.val[0], vec.values.val[0]); + vec.values.val[0] = BlendHalfRegs<2, (mask & 0x04) != 0>::impl( + a.values.val[0], b.values.val[0], vec.values.val[0]); + vec.values.val[0] = BlendHalfRegs<3, (mask & 0x08) != 0>::impl( + a.values.val[0], b.values.val[0], vec.values.val[0]); + + vec.values.val[0] = BlendHalfRegs<4, (mask & 0x10) != 0>::impl( + a.values.val[0], b.values.val[0], vec.values.val[0]); + vec.values.val[0] = BlendHalfRegs<5, (mask & 0x20) != 0>::impl( + a.values.val[0], b.values.val[0], vec.values.val[0]); + vec.values.val[0] = BlendHalfRegs<6, (mask & 0x40) != 0>::impl( + a.values.val[0], b.values.val[0], vec.values.val[0]); + vec.values.val[0] = BlendHalfRegs<7, (mask & 0x80) != 0>::impl( + a.values.val[0], b.values.val[0], vec.values.val[0]); + + // 1. 
+ vec.values.val[1] = BlendHalfRegs<0, (mask & 0x10) != 0>::impl( + a.values.val[1], b.values.val[1], vec.values.val[1]); + vec.values.val[1] = BlendHalfRegs<1, (mask & 0x20) != 0>::impl( + a.values.val[1], b.values.val[1], vec.values.val[1]); + vec.values.val[1] = BlendHalfRegs<2, (mask & 0x40) != 0>::impl( + a.values.val[1], b.values.val[1], vec.values.val[1]); + vec.values.val[1] = BlendHalfRegs<3, (mask & 0x80) != 0>::impl( + a.values.val[1], b.values.val[1], vec.values.val[1]); + + vec.values.val[1] = BlendHalfRegs<4, (mask & 0x10) != 0>::impl( + a.values.val[1], b.values.val[1], vec.values.val[1]); + vec.values.val[1] = BlendHalfRegs<5, (mask & 0x20) != 0>::impl( + a.values.val[1], b.values.val[1], vec.values.val[1]); + vec.values.val[1] = BlendHalfRegs<6, (mask & 0x40) != 0>::impl( + a.values.val[1], b.values.val[1], vec.values.val[1]); + vec.values.val[1] = BlendHalfRegs<7, (mask & 0x80) != 0>::impl( + a.values.val[1], b.values.val[1], vec.values.val[1]); + + return vec; + } + static Vectorized blendv( + const Vectorized& a, + const Vectorized& b, + const Vectorized& mask) { + // Note: using blendv is very awkward because 0xFFFF is one of many NaN's in + // FP16 It's unfortunate that the mask has type Half (required from + // vec_base) + + // TODO + // NB: This requires that each value, i.e., each uint value, + // of the mask either all be zeros or all be 1s. + // We perhaps need some kind of an assert? + // But that will affect performance. + Vectorized vec(mask.values); + vec.values.val[0] = vbslq_f16( + vreinterpretq_u16_f16(vec.values.val[0]), + b.values.val[0], + a.values.val[0]); + vec.values.val[1] = vbslq_f16( + vreinterpretq_u16_f16(vec.values.val[1]), + b.values.val[1], + a.values.val[1]); + return vec; + } + template + static Vectorized arange( + c10::Half base = 0.0, + step_t step = static_cast(1)) { + const Vectorized base_vec(base); + const Vectorized step_vec(step); + const Vectorized step_sizes( + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); + return fmadd(step_sizes, step_vec, base_vec); + } + static Vectorized set( + const Vectorized& a, + const Vectorized& b, + int64_t count = size()) { + uint16_t pre_mask[16] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + for (int i = 0; i < count; i++) { + pre_mask[i] = 0xFFFF; + } + uint16x8x2_t mask = vld1q_u16_x2(pre_mask); + + // Using blendv is awkward because 0xFFFF is one of many NaN's in FP16 + // so we directly use vbslq_f16 instead + Vectorized vec( + vbslq_f16( + // Low bits + mask.val[0], + b.values.val[0], + a.values.val[0]), + // High bits + vbslq_f16(mask.val[1], b.values.val[1], a.values.val[1])); + + return vec; + } + static Vectorized loadu(const void* ptr, int64_t count = size()) { + if (count == size()) { + return vld1q_f16_x2(reinterpret_cast(ptr)); + } else if (count == (size() >> 1)) { + Vectorized res; + res.values.val[0] = vld1q_f16(reinterpret_cast(ptr)); + res.values.val[1] = vdupq_n_f16(0); + return res; + } else { + __at_align__ float16_t tmp_values[size()]; + for (const auto i : c10::irange(size())) { + tmp_values[i] = 0; + } + std::memcpy( + tmp_values, + reinterpret_cast(ptr), + count * sizeof(float16_t)); + return vld1q_f16_x2(reinterpret_cast(tmp_values)); + } + } + void store(void* ptr, int64_t count = size()) const { + if (count == size()) { + vst1q_f16_x2(reinterpret_cast(ptr), values); + return; + } else if (count == (size() >> 1)) { + vst1q_f16(reinterpret_cast(ptr), values.val[0]); + } else { + float16_t tmp_values[size()]; + vst1q_f16_x2(reinterpret_cast(tmp_values), 
values); + std::memcpy(ptr, tmp_values, count * sizeof(float16_t)); + } + } + inline const float16x8_t& get_low() const { + return values.val[0]; + } + inline float16x8_t& get_low() { + return values.val[0]; + } + inline const float16x8_t& get_high() const { + return values.val[1]; + } + inline float16x8_t& get_high() { + return values.val[1]; + } + // Very slow implementation of indexing. + // Only required because vec256_qint refers to this. + // Once we specialize that implementation for ARM + // this should be removed. TODO (kimishpatel) + c10::Half operator[](int idx) const { + __at_align__ c10::Half tmp[size()]; + store(tmp); + return tmp[idx]; + } + c10::Half operator[](int idx) { + __at_align__ c10::Half tmp[size()]; + store(tmp); + return tmp[idx]; + } + // For boolean version where we want to if any 1/all zero + // etc. can be done faster in a different way. + int zero_mask() const { + __at_align__ c10::Half tmp[size()]; + store(tmp); + int mask = 0; + for (int i = 0; i < size(); ++i) { + if (tmp[i] == 0) { + mask |= (1 << i); + } + } + return mask; + } + Vectorized isnan() const { + __at_align__ c10::Half tmp[size()]; + __at_align__ c10::Half res[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + if (_isnan(tmp[i])) { + std::memset(static_cast(&res[i]), 0xFF, sizeof(c10::Half)); + } else { + std::memset(static_cast(&res[i]), 0, sizeof(c10::Half)); + } + } + return loadu(res); + }; + bool has_inf_nan() const { + __at_align__ c10::Half tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + if (_isnan(tmp[i]) || _isinf(tmp[i])) { + return true; + } + } + return false; + } + Vectorized map(c10::Half (*const f)(c10::Half)) const { + __at_align__ c10::Half tmp[size()]; + store(tmp); + for (const auto i : c10::irange(size())) { + tmp[i] = f(tmp[i]); + } + return loadu(tmp); + } + Vectorized abs() const { + return Vectorized( + vabsq_f16(values.val[0]), vabsq_f16(values.val[1])); + } + Vectorized angle() const { + auto zero = Vectorized(0); + auto pi = Vectorized(c10::pi); + auto tmp = blendv(zero, pi, *this < zero); + return blendv(tmp, *this, isnan()); + } + Vectorized real() const { + return *this; + } + Vectorized imag() const { + return Vectorized(0); + } + Vectorized conj() const { + return *this; + } + + // Sleef does not support FP16, so many math functions are applied by + // converting to FP32, applying the math function, and then converting back to + // FP16. 
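Aside: as the comment above says, these methods widen to FP32, call the float implementation, and narrow back. A standalone sketch of that round trip on one float16x8_t (map_via_fp32 is a hypothetical helper; assumes an aarch64 toolchain with NEON FP16 support, like the guard on this block):

#include <arm_neon.h>

// Apply a float32x4_t -> float32x4_t op to all eight half-precision lanes
// by widening, applying, and narrowing, like map_with_vec_float_method.
template <typename Op>
float16x8_t map_via_fp32(const Op& op, float16x8_t v) {
  float32x4_t lo = op(vcvt_f32_f16(vget_low_f16(v)));
  float32x4_t hi = op(vcvt_f32_f16(vget_high_f16(v)));
  return vcombine_f16(vcvt_f16_f32(lo), vcvt_f16_f32(hi));
}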
+ Vectorized acos() const { + return map_with_vec_float_method(&Vectorized::acos); + } + Vectorized acosh() const { + return map_with_vec_float_method(&Vectorized::acosh); + } + Vectorized asin() const { + return map_with_vec_float_method(&Vectorized::asin); + } + Vectorized atan() const { + return map_with_vec_float_method(&Vectorized::atan); + } + Vectorized atanh() const { + return map_with_vec_float_method(&Vectorized::atanh); + } + Vectorized atan2(const Vectorized& exp) const { + return map2_with_vec_float_method(exp, &Vectorized::atan2); + } + Vectorized copysign(const Vectorized& sign) const { + return map2_with_vec_float_method(sign, &Vectorized::copysign); + } + Vectorized erf() const { + return map_with_vec_float_method(&Vectorized::erf); + } + Vectorized erfc() const { + return map_with_vec_float_method(&Vectorized::erfc); + } + Vectorized erfinv() const { + return map_with_vec_float_method(&Vectorized::erfinv); + } + Vectorized exp() const { + return map_with_vec_float_method(&Vectorized::exp); + } + Vectorized exp2() const { + return map_with_vec_float_method(&Vectorized::exp2); + } + Vectorized expm1() const { + return map_with_vec_float_method(&Vectorized::expm1); + } + Vectorized exp_u20() const { + return map_with_vec_float_method(&Vectorized::exp_u20); + } + Vectorized fmod(const Vectorized& q) const { + // This function is questionable with a conversion, so we use map2 + return map2(q, std::fmod); + } + Vectorized hypot(const Vectorized& b) const { + return map2_with_vec_float_method(b, &Vectorized::hypot); + } + Vectorized i0() const { + return map_with_vec_float_method(&Vectorized::i0); + } + Vectorized i0e() const { + return map_with_vec_float_method(&Vectorized::i0e); + } + Vectorized digamma() const { + return map_with_vec_float_method(&Vectorized::digamma); + } + Vectorized igamma(const Vectorized& x) const { + return map2_with_vec_float_method(x, &Vectorized::igamma); + } + Vectorized igammac(const Vectorized& x) const { + return map2_with_vec_float_method(x, &Vectorized::igammac); + } + Vectorized log() const { + return map_with_vec_float_method(&Vectorized::log); + } + Vectorized log10() const { + return map_with_vec_float_method(&Vectorized::log10); + } + Vectorized log1p() const { + return map_with_vec_float_method(&Vectorized::log1p); + } + Vectorized log2() const { + return map_with_vec_float_method(&Vectorized::log2); + } + Vectorized nextafter(const Vectorized& b) const { + // This function does not make sense with conversion, so we use map2 + return map2(b, std::nextafter); + } + Vectorized frac() const; + Vectorized sin() const { + return map_with_vec_float_method(&Vectorized::sin); + } + Vectorized sinh() const { + return map_with_vec_float_method(&Vectorized::sinh); + } + Vectorized cos() const { + return map_with_vec_float_method(&Vectorized::cos); + } + Vectorized cosh() const { + return map_with_vec_float_method(&Vectorized::cosh); + } + Vectorized ceil() const { + // This function is questionable with a conversion, so we use map + return map(at::native::ceil_impl); + } + Vectorized floor() const { + // This function is questionable with a conversion, so we use map + return map(at::native::floor_impl); + } + Vectorized neg() const { + return Vectorized( + vnegq_f16(values.val[0]), vnegq_f16(values.val[1])); + } + inline Vectorized round() const { + // This function is questionable with a conversion, so we use map + return map(at::native::round_impl); + } + inline Vectorized tan() const { + return map_with_vec_float_method(&Vectorized::tan); + } + 
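For `fmod` and `nextafter` the methods above deliberately skip the FP32 detour and use `map2` with the scalar `std::fmod` / `std::nextafter` applied per lane. A rough stand-alone equivalent of that spill/apply/reload pattern is sketched below; `map2_lanewise` is a hypothetical name, and it operates on raw `float16_t` buffers whereas the real `map2` works through `c10::Half`.

```cpp
// Hedged sketch of the scalar per-lane fallback used by map2.
#include <arm_neon.h>

float16x8x2_t map2_lanewise(float16x8x2_t a, float16x8x2_t b,
                            float16_t (*f)(float16_t, float16_t)) {
  float16_t ta[16];
  float16_t tb[16];
  vst1q_f16_x2(ta, a);          // spill both operands to memory
  vst1q_f16_x2(tb, b);
  for (int i = 0; i < 16; ++i) {
    ta[i] = f(ta[i], tb[i]);    // apply the scalar binary op per lane
  }
  return vld1q_f16_x2(ta);      // reload the results
}
```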
inline Vectorized tanh() const { + return map_with_vec_float_method(&Vectorized::tanh); + } + Vectorized trunc() const { + float16x8_t r0 = vrndq_f16(values.val[0]); + float16x8_t r1 = vrndq_f16(values.val[1]); + return Vectorized(r0, r1); + } + Vectorized lgamma() const { + return map_with_vec_float_method(&Vectorized::lgamma); + } + Vectorized sqrt() const { + return Vectorized( + vsqrtq_f16(values.val[0]), vsqrtq_f16(values.val[1])); + } + Vectorized reciprocal() const { + auto ones = vdupq_n_f16(1.0f); + auto r0 = vdivq_f16(ones, values.val[0]); + auto r1 = vdivq_f16(ones, values.val[1]); + return Vectorized(r0, r1); + } + Vectorized rsqrt() const { + return this->sqrt().reciprocal(); + } + Vectorized pow(const Vectorized& exp) const { + return map2_with_vec_float_method(exp, &Vectorized::pow); + } + Vectorized operator==(const Vectorized& other) const { + float16x8_t r0 = + vreinterpretq_f16_u16(vceqq_f16(values.val[0], other.values.val[0])); + float16x8_t r1 = + vreinterpretq_f16_u16(vceqq_f16(values.val[1], other.values.val[1])); + return Vectorized(r0, r1); + } + + Vectorized operator!=(const Vectorized& other) const { + float16x8_t r0 = vreinterpretq_f16_u16( + vmvnq_u16(vceqq_f16(values.val[0], other.values.val[0]))); + float16x8_t r1 = vreinterpretq_f16_u16( + vmvnq_u16(vceqq_f16(values.val[1], other.values.val[1]))); + return Vectorized(r0, r1); + } + + Vectorized operator<(const Vectorized& other) const { + float16x8_t r0 = + vreinterpretq_f16_u16(vcltq_f16(values.val[0], other.values.val[0])); + float16x8_t r1 = + vreinterpretq_f16_u16(vcltq_f16(values.val[1], other.values.val[1])); + return Vectorized(r0, r1); + } + + Vectorized operator<=(const Vectorized& other) const { + float16x8_t r0 = + vreinterpretq_f16_u16(vcleq_f16(values.val[0], other.values.val[0])); + float16x8_t r1 = + vreinterpretq_f16_u16(vcleq_f16(values.val[1], other.values.val[1])); + return Vectorized(r0, r1); + } + + Vectorized operator>(const Vectorized& other) const { + float16x8_t r0 = + vreinterpretq_f16_u16(vcgtq_f16(values.val[0], other.values.val[0])); + float16x8_t r1 = + vreinterpretq_f16_u16(vcgtq_f16(values.val[1], other.values.val[1])); + return Vectorized(r0, r1); + } + + Vectorized operator>=(const Vectorized& other) const { + float16x8_t r0 = + vreinterpretq_f16_u16(vcgeq_f16(values.val[0], other.values.val[0])); + float16x8_t r1 = + vreinterpretq_f16_u16(vcgeq_f16(values.val[1], other.values.val[1])); + return Vectorized(r0, r1); + } + + Vectorized eq(const Vectorized& other) const; + Vectorized ne(const Vectorized& other) const; + Vectorized gt(const Vectorized& other) const; + Vectorized ge(const Vectorized& other) const; + Vectorized lt(const Vectorized& other) const; + Vectorized le(const Vectorized& other) const; +}; // Vectorized + +template <> +Vectorized inline operator+( + const Vectorized& a, + const Vectorized& b) { + float16x8_t r0 = vaddq_f16(a.get_low(), b.get_low()); + float16x8_t r1 = vaddq_f16(a.get_high(), b.get_high()); + return Vectorized(r0, r1); +} + +template <> +Vectorized inline operator-( + const Vectorized& a, + const Vectorized& b) { + float16x8_t r0 = vsubq_f16(a.get_low(), b.get_low()); + float16x8_t r1 = vsubq_f16(a.get_high(), b.get_high()); + return Vectorized(r0, r1); +} + +template <> +Vectorized inline operator*( + const Vectorized& a, + const Vectorized& b) { + float16x8_t r0 = vmulq_f16(a.get_low(), b.get_low()); + float16x8_t r1 = vmulq_f16(a.get_high(), b.get_high()); + return Vectorized(r0, r1); +} + +template <> +Vectorized inline operator/( + 
const Vectorized& a, + const Vectorized& b) { + float16x8_t r0 = vdivq_f16(a.get_low(), b.get_low()); + float16x8_t r1 = vdivq_f16(a.get_high(), b.get_high()); + return Vectorized(r0, r1); +} + +// frac. Implement this here so we can use subtraction +inline Vectorized Vectorized::frac() const { + return *this - this->trunc(); +} + +// Implements the IEEE 754 201X `maximum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline maximum( + const Vectorized& a, + const Vectorized& b) { + float16x8_t r0 = vmaxq_f16(a.get_low(), b.get_low()); + float16x8_t r1 = vmaxq_f16(a.get_high(), b.get_high()); + return Vectorized(r0, r1); +} + +// Implements the IEEE 754 201X `minimum` operation, which propagates NaN if +// either input is a NaN. +template <> +Vectorized inline minimum( + const Vectorized& a, + const Vectorized& b) { + float16x8_t r0 = vminq_f16(a.get_low(), b.get_low()); + float16x8_t r1 = vminq_f16(a.get_high(), b.get_high()); + return Vectorized(r0, r1); +} + +template <> +Vectorized inline clamp( + const Vectorized& a, + const Vectorized& min, + const Vectorized& max) { + return minimum(max, maximum(min, a)); +} + +template <> +Vectorized inline clamp_max( + const Vectorized& a, + const Vectorized& max) { + return minimum(max, a); +} + +template <> +Vectorized inline clamp_min( + const Vectorized& a, + const Vectorized& min) { + return maximum(min, a); +} + +template <> +Vectorized inline operator&( + const Vectorized& a, + const Vectorized& b) { + float16x8_t r0 = vreinterpretq_f16_u16(vandq_u16( + vreinterpretq_u16_f16(a.get_low()), vreinterpretq_u16_f16(b.get_low()))); + float16x8_t r1 = vreinterpretq_f16_u16(vandq_u16( + vreinterpretq_u16_f16(a.get_high()), + vreinterpretq_u16_f16(b.get_high()))); + return Vectorized(r0, r1); +} + +template <> +Vectorized inline operator|( + const Vectorized& a, + const Vectorized& b) { + float16x8_t r0 = vreinterpretq_f16_u16(vorrq_u16( + vreinterpretq_u16_f16(a.get_low()), vreinterpretq_u16_f16(b.get_low()))); + float16x8_t r1 = vreinterpretq_f16_u16(vorrq_u16( + vreinterpretq_u16_f16(a.get_high()), + vreinterpretq_u16_f16(b.get_high()))); + return Vectorized(r0, r1); +} + +template <> +Vectorized inline operator^( + const Vectorized& a, + const Vectorized& b) { + float16x8_t r0 = vreinterpretq_f16_u16(veorq_u16( + vreinterpretq_u16_f16(a.get_low()), vreinterpretq_u16_f16(b.get_low()))); + float16x8_t r1 = vreinterpretq_f16_u16(veorq_u16( + vreinterpretq_u16_f16(a.get_high()), + vreinterpretq_u16_f16(b.get_high()))); + return Vectorized(r0, r1); +} + +inline Vectorized Vectorized::eq( + const Vectorized& other) const { + return (*this == other) & Vectorized(1); +} + +inline Vectorized Vectorized::ne( + const Vectorized& other) const { + return (*this != other) & Vectorized(1); +} + +inline Vectorized Vectorized::gt( + const Vectorized& other) const { + return (*this > other) & Vectorized(1); +} + +inline Vectorized Vectorized::ge( + const Vectorized& other) const { + return (*this >= other) & Vectorized(1); +} + +inline Vectorized Vectorized::lt( + const Vectorized& other) const { + return (*this < other) & Vectorized(1); +} + +inline Vectorized Vectorized::le( + const Vectorized& other) const { + return (*this <= other) & Vectorized(1); +} + +template <> +inline void convert(const float16_t* src, int16_t* dst, int64_t n) { + int64_t i; +#pragma unroll + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + vst1q_s16(dst + i, vcvtq_s16_f16(vld1q_f16(src + i))); + vst1q_s16(dst + i + 8, 
vcvtq_s16_f16(vld1q_f16(src + i + 8))); + } +#pragma unroll + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +inline void convert(const int16_t* src, float16_t* dst, int64_t n) { + int64_t i; +#pragma unroll + for (i = 0; i <= (n - Vectorized::size()); + i += Vectorized::size()) { + vst1q_f16(dst + i, vcvtq_f16_s16(vld1q_s16(src + i))); + vst1q_f16(dst + i + 8, vcvtq_f16_s16(vld1q_s16(src + i + 8))); + } +#pragma unroll + for (; i < n; i++) { + dst[i] = static_cast(src[i]); + } +} + +template <> +Vectorized inline fmadd( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + float16x8_t r0 = vfmaq_f16(c.get_low(), a.get_low(), b.get_low()); + float16x8_t r1 = vfmaq_f16(c.get_high(), a.get_high(), b.get_high()); + return Vectorized(r0, r1); +} + +template <> +Vectorized inline fmsub( + const Vectorized& a, + const Vectorized& b, + const Vectorized& c) { + float16x8_t r0 = vfmsq_f16(c.get_low(), a.get_low(), b.get_low()); + float16x8_t r1 = vfmsq_f16(c.get_high(), a.get_high(), b.get_high()); + return Vectorized(r0, r1); +} + +#endif /* defined(aarch64) && defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && !defined(C10_MOBILE) */ + +} // namespace CPU_CAPABILITY +} // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_int.h b/aten/src/ATen/cpu/vec/vec256/vec256_int.h index 392a22bee62ae..6263efd2039ce 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_int.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_int.h @@ -494,7 +494,7 @@ class Vectorized : public Vectorizedi { template class Vectorized8 : public Vectorizedi { static_assert( - std::is_same::value || std::is_same::value, + std::is_same_v || std::is_same_v, "Only int8_t/uint8_t are supported"); protected: static const Vectorized ones; @@ -1382,7 +1382,7 @@ Vectorized inline shift_256_16(const Vectorized& a, const Vect return c; } -template ::value || std::is_same::value, int> = 0> +template || std::is_same_v, int> = 0> Vectorized inline shift_256_8(const Vectorized& a, const Vectorized& b) { // No vector instruction for shifting int8_t/uint8_t, so emulating // it instead. 
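The comment in `shift_256_8` points at a general AVX2 gap: there are no 8-bit shift instructions. A much-simplified sketch of the emulation idea follows, for a uniform compile-time shift count only; the real `shift_256_8` takes a vector of per-lane counts and needs considerably more masking and interleaving than shown here. The function name `slli_epu8` is illustrative.

```cpp
#include <immintrin.h>

// Left-shift every uint8 lane by a compile-time count: do the shift in
// 16-bit lanes, then mask away the bits that leaked in from the byte below.
template <int kCount>
__m256i slli_epu8(__m256i a) {
  static_assert(kCount >= 0 && kCount < 8, "byte shift count must be 0..7");
  const __m256i shifted = _mm256_slli_epi16(a, kCount);
  const __m256i keep = _mm256_set1_epi8(static_cast<char>(0xFF << kCount));
  return _mm256_and_si256(shifted, keep);
}
```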
diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_mask.h b/aten/src/ATen/cpu/vec/vec256/vec256_mask.h new file mode 100644 index 0000000000000..dd6a8c52d8265 --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec256/vec256_mask.h @@ -0,0 +1,93 @@ +#pragma once + +#include +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) + +template +struct VecMaskLoad< + T, + 1, + mask_t, + 1, + typename std::enable_if_t< + std::is_same_v || std::is_same_v || + std::is_same_v, + void>> { + static inline VectorizedN apply( + const T* ptr, + const VecMask& vec_mask) { + auto int_mask = vec_mask.template cast()[0]; + if constexpr (std::is_same_v) { + return Vectorized(_mm256_maskload_ps(ptr, int_mask)); + } else { + return Vectorized(_mm256_maskload_epi32(ptr, int_mask)); + } + } +}; + +// TODO: add specialization of VecMaskLoad for bfloat16/half and int8/uint8 + +template <> +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + return Vectorized(_mm256_castsi256_ps(vec_mask[0])); + } +}; + +template <> +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + return Vectorized(_mm256_castps_si256(vec_mask[0])); + } +}; + +template +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + auto int_vec = convert(VectorizedN(vec_mask)); + return VecMask(int_vec).cast(); + } +}; + +template <> +inline bool VecMask::all_zero() const { + return _mm256_testz_si256(mask_[0], mask_[0]); +} + +template <> +inline bool VecMask::is_masked(int i) const { + return _mm256_movemask_ps(_mm256_castsi256_ps(mask_[0])) & (1 << i); +} + +template <> +inline bool VecMask::all_masked() const { + int mask = _mm256_movemask_ps(_mm256_castsi256_ps(mask_[0])); + return mask == 0xff; +} + +#define VEC_MASK_METHOD_WITH_CAST_TO_INT( \ + T, N, return_type, method, args_def, args) \ + template <> \ + inline return_type VecMask::method args_def const { \ + return cast().method args; \ + } + +VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_zero, (), ()) +VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_zero, (), ()) +VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, is_masked, (int i), (i)) +VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, is_masked, (int i), (i)) +VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_masked, (), ()) +VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_masked, (), ()) + +#undef VEC_MASK_DEFINE_METHOD_WITH_CAST_TO_INT + +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h index ee14de69324fa..c1defcdfd5189 100644 --- a/aten/src/ATen/cpu/vec/vec256/vec256_qint.h +++ b/aten/src/ATen/cpu/vec/vec256/vec256_qint.h @@ -41,11 +41,17 @@ namespace at::vec { inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX2) +#ifdef _MSC_VER +__declspec(align(64)) struct Vectorizedqi { + protected: + __m256i vals; +#else struct Vectorizedqi { protected: __m256i vals __attribute__((aligned(64))); +#endif public: Vectorizedqi() {} @@ -96,28 +102,36 @@ inline __m256i pack_saturate_and_clamp( _mm256_min_epu8(packed_and_sat, _mm256_set1_epi8(max_val))); } -inline Vectorized convert_uint8_to_float(at::vec::Vectorized src) { +template +typename std::enable_if_t || std::is_same_v, at::vec::Vectorized> +inline convert_int8_to_float(at::vec::Vectorized src) { // Note: this function only convert inputs 
number of elements equal to at::vec::Vectorized.size() - // Only handle first 64 bits + // Only handle first 8*8 bits __m128i input_128 = _mm256_castsi256_si128(src); - // Convert from 8*uint8 to 8*int32 - __m256i input_256_int32 = _mm256_cvtepu8_epi32(input_128); + // Convert from 8*uint8/int8 to 8*int32 + __m256i input_256_int32; + if constexpr (std::is_same_v) + input_256_int32 = _mm256_cvtepu8_epi32(input_128); + else + input_256_int32 = _mm256_cvtepi8_epi32(input_128); // Convert from 8*int32 to 8*float return _mm256_cvtepi32_ps(input_256_int32); } -inline Vectorized convert_float_to_uint8(at::vec::Vectorized src) { +template +typename std::enable_if_t || std::is_same_v, at::vec::Vectorized> +inline convert_float_to_int8(at::vec::Vectorized src) { // Convert from float32 to int32 with truncation __m256i x_values_int32 = _mm256_cvttps_epi32(src); // Convert from int32 to int16 using signed saturation __m256i xy_packed_v = _mm256_packs_epi32(x_values_int32, x_values_int32); - constexpr auto min_val = std::numeric_limits::min(); - constexpr auto max_val = std::numeric_limits::max(); + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); - // Convert from int16 to uint8 using unsigned saturation - __m256i xyzw_clamped_v = pack_saturate_and_clamp( + // Convert from int16 to uint8/int8 using unsigned saturation + __m256i xyzw_clamped_v = pack_saturate_and_clamp( xy_packed_v, xy_packed_v, min_val, max_val); __m256i permute_mask_v = _mm256_set_epi32(0x07, 0x03, 0x06, 0x02, 0x05, 0x01, 0x04, 0x00); @@ -125,7 +139,7 @@ inline Vectorized convert_float_to_uint8(at::vec::Vectorized src } template -inline void __attribute__((always_inline)) QuantizeAvx2( +__FORCE_INLINE void QuantizeAvx2( const float* src, T* dst, int len, @@ -394,7 +408,7 @@ __m256i RequantizeAvx2( __m256 multiplier, __m256i zp) { static_assert( - std::is_same::value || std::is_same::value, + std::is_same_v || std::is_same_v, "Only int8_t/uint8_t are supported"); constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); @@ -1323,5 +1337,5 @@ Vectorized inline maximum(const Vectorized& a, const V return a.maximum(b); } -#endif // if defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) +#endif // if defined(CPU_CAPABILITY_AVX2) }} // namespace at::vec::CPU_CAPABILITY diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h index e48dd542265fb..2d8afd9ef2952 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_bfloat16_vsx.h @@ -51,6 +51,23 @@ inline void load_fp32_from_bf16( load_fp32_from_bf16(data, out2); } +inline void load_fp32_from_fp16(const c10::Half* data, Vectorized& out) { + __at_align__ float values[Vectorized::size()]; + for (const auto k : c10::irange(Vectorized::size())) { + values[k] = data[k]; + } + out = Vectorized::loadu(values); +} + +inline void load_fp32_from_fp16( + const c10::Half* data, + Vectorized& out1, + Vectorized& out2) { + load_fp32_from_fp16(data, out1); + data += Vectorized::size(); + load_fp32_from_fp16(data, out2); +} + } // namespace } // namespace vec } // namespace at diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h index 05b2f6499261b..9f4d38c920f7b 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h +++ 
b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_double_vsx.h @@ -212,12 +212,19 @@ class Vectorized { static Vectorized el_mergee( Vectorized& first, Vectorized& second) { - // as mergee phased in , we can use vec_perm with mask return { vec_mergeh(first._vec0, second._vec0), vec_mergeh(first._vec1, second._vec1)}; } + static Vectorized el_mergeo( + Vectorized& first, + Vectorized& second) { + return { + vec_mergel(first._vec0, second._vec0), + vec_mergel(first._vec1, second._vec1)}; + } + Vectorized abs_2_() const { auto a = (*this).elwise_mult(*this); auto permuted = a.el_swapped(); @@ -385,13 +392,11 @@ class Vectorized { static Vectorized horizontal_add( Vectorized& first, Vectorized& second) { - auto first_perm = first.el_swapped(); // 2perm - auto second_perm = second.el_swapped(); // 2perm - // summ - auto first_ret = first + first_perm; // 2add - auto second_ret = second + second_perm; // 2 add - // now lets choose evens - return el_mergee(first_ret, second_ret); // 2 mergee's + // Operates on individual floats, see _mm_hadd_ps + // {f0+f1, s0+s1, f2+f3, s2+s3, ...} + // i.e. it sums the re and im of each value and interleaves first and second: + // {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1, ...} + return el_mergee(first, second) + el_mergeo(first, second); } static Vectorized horizontal_sub( @@ -432,25 +437,20 @@ class Vectorized { // re + im*i = (a + bi) / (c + di) // re = (ac + bd)/abs_2() // im = (bc - ad)/abs_2() -#if 1 - auto vi = b.el_mergeo(); - auto vr = b.el_mergee(); - auto abs_b = b.abs_2_(); - vi = vi ^ vd_isign_mask; - auto ret = elwise_mult(vr); - auto vx_swapped = el_swapped(); - ret = vx_swapped.el_madd(vi, ret); - ret = ret.elwise_div(abs_b); -#else - // Vectorized x86 simulation - auto ac_bd = elwise_mult(b); - auto d_c = b.el_swapped(); - d_c = d_c ^ vd_rsign_mask; - auto ad_bc = elwise_mult(d_c); - auto abs_b = b.abs_2_(); - auto re_im = horizontal_add(ac_bd, ad_bc); - auto ret = re_im.elwise_div(abs_b); -#endif + auto fabs_cd = Vectorized{ + vec_andc(b._vec0, vd_sign_mask), + vec_andc(b._vec1, vd_sign_mask)}; // |c| |d| + auto fabs_dc = fabs_cd.el_swapped(); // |d| |c| + auto scale = fabs_cd.elwise_max(fabs_dc); // sc = max(|c|, |d|) + auto a2 = elwise_div(scale); // a/sc b/sc + auto b2 = b.elwise_div(scale); // c/sc d/sc + auto acbd2 = a2.elwise_mult(b2); // ac/sc^2 bd/sc^2 + auto dc2 = b2.el_swapped(); // d/sc c/sc + dc2 = dc2 ^ vd_rsign_mask; // -d/sc c/sc + auto adbc2 = a2.elwise_mult(dc2); // -ad/sc^2 bc/sc^2 + auto ret = horizontal_add(acbd2, adbc2); // (ac+bd)/sc^2 (bc-ad)/sc^2 + auto denom2 = b2.abs_2_(); // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 + ret = ret.elwise_div(denom2); return ret; } @@ -511,13 +511,14 @@ class Vectorized { DEFINE_MEMBER_OP(operator&, ComplexDbl, vec_and) DEFINE_MEMBER_OP(operator|, ComplexDbl, vec_or) DEFINE_MEMBER_OP(operator^, ComplexDbl, vec_xor) - // elelemtwise helpers + // elementwise helpers DEFINE_MEMBER_OP(elwise_mult, ComplexDbl, vec_mul) DEFINE_MEMBER_OP(elwise_div, ComplexDbl, vec_div) DEFINE_MEMBER_OP(elwise_gt, ComplexDbl, vec_cmpgt) DEFINE_MEMBER_OP(elwise_ge, ComplexDbl, vec_cmpge) DEFINE_MEMBER_OP(elwise_lt, ComplexDbl, vec_cmplt) DEFINE_MEMBER_OP(elwise_le, ComplexDbl, vec_cmple) + DEFINE_MEMBER_OP(elwise_max, ComplexDbl, vec_max) }; template <> diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h index 91bf616db4bc4..53e80523f761a 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h +++ 
b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_complex_float_vsx.h @@ -238,18 +238,14 @@ class Vectorized { return loadu(tmp); } - static Vectorized horizontal_add_permD8( + static Vectorized horizontal_add( Vectorized& first, Vectorized& second) { - // we will simulate it differently with 6 instructions total - // lets permute second so that we can add it getting horizontal sums - auto first_perm = first.el_swapped(); // 2perm - auto second_perm = second.el_swapped(); // 2perm - // sum - auto first_ret = first + first_perm; // 2add - auto second_ret = second + second_perm; // 2 add - // now lets choose evens - return el_mergee(first_ret, second_ret); // 2 mergee's + // Operates on individual floats, see _mm_hadd_ps + // {f0+f1, s0+s1, f2+f3, s2+s3, ...} + // i.e. it sums the re and im of each value and interleaves first and second: + // {f_re0 + f_im0, s_re0 + s_im0, f_re1 + f_im1, s_re1 + s_im1, ...} + return el_mergee(first, second) + el_mergeo(first, second); } static Vectorized horizontal_sub_permD8( @@ -353,12 +349,19 @@ class Vectorized { static Vectorized el_mergee( Vectorized& first, Vectorized& second) { - // as mergee phased in , we can use vec_perm with mask return { vec_mergee(first._vecb0, second._vecb0), vec_mergee(first._vecb1, second._vecb1)}; } + static Vectorized el_mergeo( + Vectorized& first, + Vectorized& second) { + return { + vec_mergeo(first._vecb0, second._vecb0), + vec_mergeo(first._vecb1, second._vecb1)}; + } + Vectorized angle_() const { // angle = atan2(b/a) // auto b_a = _mm256_permute_ps(values, 0xB1); // b a @@ -488,25 +491,20 @@ class Vectorized { // re + im*i = (a + bi) / (c + di) // re = (ac + bd)/abs_2() // im = (bc - ad)/abs_2() -#if 1 - auto vi = b.el_mergeo(); - auto vr = b.el_mergee(); - auto abs_b = b.abs_2_(); - vi = vi ^ isign_mask; - auto ret = elwise_mult(vr); - auto vx_swapped = el_swapped(); - ret = vx_swapped.el_madd(vi, ret); - ret = ret.elwise_div(abs_b); -#else - // Vectorized x86 simulation - auto ac_bd = elwise_mult(b); - auto d_c = b.el_swapped(); - d_c = d_c ^ rsign_mask; - auto ad_bc = elwise_mult(d_c); - auto abs_b = b.abs_2_(); - auto re_im = horizontal_add_permD8(ac_bd, ad_bc); - auto ret = re_im.elwise_div(abs_b); -#endif + auto fabs_cd = Vectorized{ + vec_andc(b._vec0, sign_mask), + vec_andc(b._vec1, sign_mask)}; // |c| |d| + auto fabs_dc = fabs_cd.el_swapped(); // |d| |c| + auto scale = fabs_cd.elwise_max(fabs_dc); // sc = max(|c|, |d|) + auto a2 = elwise_div(scale); // a/sc b/sc + auto b2 = b.elwise_div(scale); // c/sc d/sc + auto acbd2 = a2.elwise_mult(b2); // ac/sc^2 bd/sc^2 + auto dc2 = b2.el_swapped(); // d/sc c/sc + dc2 = dc2 ^ rsign_mask; // -d/sc c/sc + auto adbc2 = a2.elwise_mult(dc2); // -ad/sc^2 bc/sc^2 + auto ret = horizontal_add(acbd2, adbc2); // (ac+bd)/sc^2 (bc-ad)/sc^2 + auto denom2 = b2.abs_2_(); // (c^2+d^2)/sc^2 (c^2+d^2)/sc^2 + ret = ret.elwise_div(denom2); return ret; } @@ -589,6 +587,7 @@ class Vectorized { DEFINE_MEMBER_OP(elwise_ge, ComplexFlt, vec_cmpge) DEFINE_MEMBER_OP(elwise_lt, ComplexFlt, vec_cmplt) DEFINE_MEMBER_OP(elwise_le, ComplexFlt, vec_cmple) + DEFINE_MEMBER_OP(elwise_max, ComplexFlt, vec_max) }; template <> diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h index bfcf5d984987e..139044cbd4698 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_double_vsx.h @@ -383,6 +383,19 @@ class Vectorized { auto ret = (x == x); return ret._nor(); } + bool has_inf_nan() const { + 
for (const auto i : c10::irange(size()/2)) { + if(_isnan(_vec0[i]) || _isinf(_vec0[i])) { + return true; + } + } + for (const auto i : c10::irange(size()/2)) { + if(_isnan(_vec1[i]) || _isinf(_vec1[i])) { + return true; + } + } + return false; + } DEFINE_MEMBER_OP(operator==, double, vec_cmpeq) DEFINE_MEMBER_OP(operator!=, double, vec_cmpne) diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h index d08fb54fd56ec..0003773e37c89 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_float_vsx.h @@ -234,9 +234,18 @@ class Vectorized { return ret._nor(); } - Vectorized _isinf() const { - auto x = *this; - return (x == v_inf) | (x == v_minus_inf); + bool has_inf_nan() const { + for (const auto i : c10::irange(size()/2)) { + if(_isnan(_vec0[i]) || _isinf(_vec0[i])) { + return true; + } + } + for (const auto i : c10::irange(size()/2)) { + if(_isnan(_vec1[i]) || _isinf(_vec1[i])) { + return true; + } + } + return false; } int zero_mask() const { diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h index 26c90a371f903..e8d12eb938e54 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_qint8_vsx.h @@ -91,7 +91,7 @@ struct Vectorized { vec_vsx_ld(offset0, reinterpret_cast(ptr)), vec_vsx_ld(offset16, reinterpret_cast(ptr))}; } - __at_align__ value_type tmp_values[size()]; + __at_align__ value_type tmp_values[size()] = {}; std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; } diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h index c3a320af156de..93f80a14638e9 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vec256_quint8_vsx.h @@ -94,7 +94,7 @@ struct Vectorized { vec_vsx_ld(offset0, reinterpret_cast(ptr)), vec_vsx_ld(offset16, reinterpret_cast(ptr))}; } - __at_align__ value_type tmp_values[size()]; + __at_align__ value_type tmp_values[size()] = {}; std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(value_type)); return {vec_vsx_ld(offset0, tmp_values), vec_vsx_ld(offset16, tmp_values)}; } diff --git a/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h b/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h index d5033990f60c7..1dc742f3cbb1c 100644 --- a/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h +++ b/aten/src/ATen/cpu/vec/vec256/vsx/vsx_helpers.h @@ -391,6 +391,7 @@ const vbool32 imag_mask = vbool32{0x0, 0xFFFFFFFF, 0x0, 0xFFFFFFFF}; const vbool32 isign_mask = vbool32{0x0, 0x80000000, 0x0, 0x80000000}; const vbool32 rsign_mask = vbool32{0x80000000, 0x0, 0x80000000, 0x0}; +const vbool64 vd_sign_mask = vbool64{0x8000000000000000, 0x8000000000000000}; const vbool64 vd_imag_mask = vbool64{0x0, 0xFFFFFFFFFFFFFFFF}; const vbool64 vd_real_mask = vbool64{0xFFFFFFFFFFFFFFFF, 0x0}; const vbool64 vd_isign_mask = vbool64{0x0, 0x8000000000000000}; diff --git a/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h b/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h index e28c999983c0c..b70b494649b36 100644 --- a/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h +++ b/aten/src/ATen/cpu/vec/vec256/zarch/vec256_zarch.h @@ -13,8 +13,6 @@ #include #include -#define SLEEF_MEMORY_WORKAROUND - namespace at { namespace vec { @@ -393,40 +391,84 @@ struct 
Vectorized()>> { C10_ALWAYS_INLINE Vectorized(T s) : _vec0{vec_splats((ElementType)s)}, _vec1{vec_splats((ElementType)s)} {} - static Vectorized C10_ALWAYS_INLINE - loadu(const void* ptr, int count = size()) { - if (count == size()) { + template + struct LoaduHelper { + static Vectorized C10_ALWAYS_INLINE + loadu(const U* ptr, int count = size()) { + __at_align__ ElementType tmp_values[size()] = {}; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(ElementType)); + return { - vec_xl(offset0, reinterpret_cast(ptr)), - vec_xl(offset16, reinterpret_cast(ptr))}; + vec_xl(offset0, &(tmp_values[0])), + vec_xl(offset16, &(tmp_values[0]))}; } + }; + + template + struct LoaduHelper { + static Vectorized C10_ALWAYS_INLINE + loadu(const ElementType* ptr, int count = size()) { + if (count == size()) { + return { + vec_xl(offset0, ptr), + vec_xl(offset16, ptr)}; + } - __at_align__ ElementType tmp_values[size()] = {}; - std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(ElementType)); + __at_align__ ElementType tmp_values[size()] = {}; + std::memcpy(tmp_values, ptr, std::min(count, size()) * sizeof(ElementType)); - return { - vec_xl(offset0, reinterpret_cast(tmp_values)), - vec_xl(offset16, reinterpret_cast(tmp_values))}; + return { + vec_xl(offset0, &(tmp_values[0])), + vec_xl(offset16, &(tmp_values[0]))}; + } + }; + + template + static Vectorized C10_ALWAYS_INLINE + loadu(const U* ptr, int count = size()) { + return LoaduHelper::loadu(ptr, count); } - static Vectorized C10_ALWAYS_INLINE - loadu_one_fourth(const void* ptr) { + template + static Vectorized C10_ALWAYS_INLINE + loadu_one_fourth(const U* ptr) { // load only first 8 bytes // only intended to be used with uint8_t return loadu(ptr, 8 / sizeof(ElementType)); } - void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { - if (count == size()) { - vec_xst(_vec0, offset0, reinterpret_cast(ptr)); - vec_xst(_vec1, offset16, reinterpret_cast(ptr)); - } else if (count > 0) { - __at_align__ ElementType tmp_values[size()]; - vec_xst(_vec0, offset0, reinterpret_cast(tmp_values)); - vec_xst(_vec1, offset16, reinterpret_cast(tmp_values)); - std::memcpy( - ptr, tmp_values, std::min(count, size()) * sizeof(ElementType)); + template + struct StoreHelper { + static void C10_ALWAYS_INLINE store(const Vectorized &vec, U* ptr, int count = size()) { + if (count > 0) { + __at_align__ ElementType tmp_values[size()]; + vec_xst(vec._vec0, offset0, &(tmp_values[0])); + vec_xst(vec._vec1, offset16, &(tmp_values[0])); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(ElementType)); + } + } + }; + + template + struct StoreHelper { + static void C10_ALWAYS_INLINE store(const Vectorized &vec, ElementType* ptr, int count = size()) { + if (count == size()) { + vec_xst(vec._vec0, offset0, ptr); + vec_xst(vec._vec1, offset16, ptr); + } else if (count > 0) { + __at_align__ ElementType tmp_values[size()]; + vec_xst(vec._vec0, offset0, &(tmp_values[0])); + vec_xst(vec._vec1, offset16, &(tmp_values[0])); + std::memcpy( + ptr, tmp_values, std::min(count, size()) * sizeof(ElementType)); + } } + }; + + template + void C10_ALWAYS_INLINE store(U* ptr, int count = size()) const { + return StoreHelper::store(*this, ptr, count); } C10_ALWAYS_INLINE const vtype& vec0() const { @@ -875,6 +917,20 @@ struct Vectorized()>> { return ret._not(); } + bool has_inf_nan() const { + for (const auto i : c10::irange(size()/2)) { + if(_isnan(_vec0[i]) || _isinf(_vec0[i])) { + return true; + } + } + for (const auto i : c10::irange(size()/2)) { + 
if(_isnan(_vec1[i]) || _isinf(_vec1[i])) { + return true; + } + } + return false; + } + template < typename U = T, std::enable_if_t::value, int> = 0> @@ -1090,32 +1146,20 @@ struct Vectorized()>> { } Vectorized sin() const { -#ifndef SLEEF_MEMORY_WORKAROUND return mapSleef(Sleef_sinf4_u10, Sleef_sind2_u10); -#else - return mapOrdinary(std::sin); -#endif } Vectorized sinh() const { return mapSleef(Sleef_sinhf4_u10, Sleef_sinhd2_u10); } Vectorized cos() const { -#ifndef SLEEF_MEMORY_WORKAROUND return mapSleef(Sleef_cosf4_u10, Sleef_cosd2_u10); -#else - return mapOrdinary(std::cos); -#endif } Vectorized cosh() const { return mapSleef(Sleef_coshf4_u10, Sleef_coshd2_u10); } Vectorized tan() const { -#ifndef SLEEF_MEMORY_WORKAROUND return mapSleef(Sleef_tanf4_u10, Sleef_tand2_u10); -#else - return mapOrdinary(std::tan); -#endif } Vectorized tanh() const { return mapSleef(Sleef_tanhf4_u10, Sleef_tanhd2_u10); @@ -1447,19 +1491,19 @@ inline ZSimdVect vec_flt_int(const ZSimdVect x) { #define vec_flt_int vec_signed #endif -Vectorized convert_to_float(const Vectorized& x) { +Vectorized zvec_convert_to_float(const Vectorized& x) { return {vec_int_flt(x.vec0()), vec_int_flt(x.vec1())}; } -Vectorized convert_to_int(const Vectorized& x) { +Vectorized zvec_convert_to_int(const Vectorized& x) { return {vec_flt_int(x.vec0()), vec_flt_int(x.vec1())}; } -Vectorized convert_to_float(const Vectorized& x) { +Vectorized zvec_convert_to_float(const Vectorized& x) { return {vec_double(x.vec0()), vec_double(x.vec1())}; } -Vectorized convert_to_int(const Vectorized& x) { +Vectorized zvec_convert_to_int(const Vectorized& x) { return {vec_signed(x.vec0()), vec_signed(x.vec1())}; } @@ -1517,13 +1561,13 @@ Vectorized C10_ALWAYS_INLINE fmadd( template <> Vectorized C10_ALWAYS_INLINE convert_to_int_of_same_size(const Vectorized& src) { - return convert_to_int(src); + return zvec_convert_to_int(src); } template <> Vectorized C10_ALWAYS_INLINE convert_to_int_of_same_size(const Vectorized& src) { - return convert_to_int(src); + return zvec_convert_to_int(src); } template <> @@ -1535,7 +1579,7 @@ inline void convert(const int32_t* src, float* dst, int64_t n) { const int32_t* src_a = src + i; float* dst_a = dst + i; auto input_vec = Vectorized::loadu(src_a); - auto output_vec = convert_to_float(input_vec); + auto output_vec = zvec_convert_to_float(input_vec); output_vec.store(dst_a); } @@ -1552,7 +1596,7 @@ inline void convert(const int64_t* src, double* dst, int64_t n) { const int64_t* src_a = src + i; double* dst_a = dst + i; auto input_vec = Vectorized::loadu(src_a); - auto output_vec = convert_to_float(input_vec); + auto output_vec = zvec_convert_to_float(input_vec); output_vec.store(dst_a); } for (; i < n; i++) { @@ -1696,12 +1740,14 @@ struct Vectorized()>> { return _vec; } + template static Vectorized C10_ALWAYS_INLINE - loadu(const void* ptr, int count = size()) { + loadu(const U* ptr, int count = size()) { return Vectorized{vinner_type::loadu(ptr, count)}; } - void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + template + void C10_ALWAYS_INLINE store(U* ptr, int count = size()) const { _vec.store(ptr, count); } @@ -1729,7 +1775,7 @@ struct Vectorized()>> { Vectorized scale, Vectorized zero_point, Vectorized scale_zp_premul) const { - auto float_val = convert_to_float(_vec); + auto float_val = zvec_convert_to_float(_vec); return {fmadd(scale, float_val, scale_zp_premul)}; } @@ -1739,7 +1785,7 @@ struct Vectorized()>> { float_vec_return_type dequantize( Vectorized scale, Vectorized zero_point) const { - 
auto float_val = convert_to_float(_vec); + auto float_val = zvec_convert_to_float(_vec); return {(float_val - zero_point) * scale}; } @@ -1754,7 +1800,7 @@ struct Vectorized()>> { Vectorized vecf = rhs[0]; vecf = vecf * Vectorized(inverse_scale); vecf = vecf.rint() + Vectorized((float)(zero_point)); - auto veci = convert_to_int(vecf); + auto veci = zvec_convert_to_int(vecf); return Vectorized{veci}; } @@ -1767,10 +1813,10 @@ struct Vectorized()>> { float multiplier, int32_t zero_point) { Vectorized vi = inp[0]; - auto vecf = convert_to_float(vi.vec()); + auto vecf = zvec_convert_to_float(vi.vec()); vecf = vecf * Vectorized(multiplier); vecf = vecf.rint(); - auto veci = convert_to_int(vecf) + Vectorized(zero_point); + auto veci = zvec_convert_to_int(vecf) + Vectorized(zero_point); return Vectorized{veci}; } @@ -1805,11 +1851,11 @@ struct Vectorized()>> { auto ret32_0 = unpack(ret16.first); auto ret32_1 = unpack(ret16.second); - auto vecf_0 = convert_to_float(ret32_0.first); - auto vecf_1 = convert_to_float(ret32_0.second); + auto vecf_0 = zvec_convert_to_float(ret32_0.first); + auto vecf_1 = zvec_convert_to_float(ret32_0.second); - auto vecf_2 = convert_to_float(ret32_1.first); - auto vecf_3 = convert_to_float(ret32_1.second); + auto vecf_2 = zvec_convert_to_float(ret32_1.first); + auto vecf_3 = zvec_convert_to_float(ret32_1.second); return { fmadd(scale, vecf_0, scale_zp_premul), fmadd(scale, vecf_1, scale_zp_premul), @@ -1828,11 +1874,11 @@ struct Vectorized()>> { auto ret32_0 = unpack(ret16.first); auto ret32_1 = unpack(ret16.second); - auto vecf_0 = convert_to_float(ret32_0.first); - auto vecf_1 = convert_to_float(ret32_0.second); + auto vecf_0 = zvec_convert_to_float(ret32_0.first); + auto vecf_1 = zvec_convert_to_float(ret32_0.second); - auto vecf_2 = convert_to_float(ret32_1.first); - auto vecf_3 = convert_to_float(ret32_1.second); + auto vecf_2 = zvec_convert_to_float(ret32_1.first); + auto vecf_3 = zvec_convert_to_float(ret32_1.second); return { (vecf_0 - zero_point) * scale, @@ -1867,10 +1913,10 @@ struct Vectorized()>> { vecf4 = vecf4.rint() + vec_zero_point; vecf6 = vecf6.rint() + vec_zero_point; - auto veci0 = convert_to_int(vecf0); - auto veci2 = convert_to_int(vecf2); - auto veci4 = convert_to_int(vecf4); - auto veci6 = convert_to_int(vecf6); + auto veci0 = zvec_convert_to_int(vecf0); + auto veci2 = zvec_convert_to_int(vecf2); + auto veci4 = zvec_convert_to_int(vecf4); + auto veci6 = zvec_convert_to_int(vecf6); auto vecshi0 = pack(veci0, veci2); auto vecshi2 = pack(veci4, veci6); @@ -1894,11 +1940,11 @@ struct Vectorized()>> { Vectorized vi2 = inp[2]; Vectorized vi3 = inp[3]; - auto vecf0 = convert_to_float(vi0.vec()); - auto vecf2 = convert_to_float(vi1.vec()); + auto vecf0 = zvec_convert_to_float(vi0.vec()); + auto vecf2 = zvec_convert_to_float(vi1.vec()); - auto vecf4 = convert_to_float(vi2.vec()); - auto vecf6 = convert_to_float(vi3.vec()); + auto vecf4 = zvec_convert_to_float(vi2.vec()); + auto vecf6 = zvec_convert_to_float(vi3.vec()); vecf0 = vecf0 * vec_multiplier; vecf2 = vecf2 * vec_multiplier; @@ -1911,10 +1957,10 @@ struct Vectorized()>> { vecf4 = vecf4.rint(); vecf6 = vecf6.rint(); - auto veci0 = convert_to_int(vecf0); - auto veci2 = convert_to_int(vecf2); - auto veci4 = convert_to_int(vecf4); - auto veci6 = convert_to_int(vecf6); + auto veci0 = zvec_convert_to_int(vecf0); + auto veci2 = zvec_convert_to_int(vecf2); + auto veci4 = zvec_convert_to_int(vecf4); + auto veci6 = zvec_convert_to_int(vecf6); veci0 = veci0 + vec_zero_point; veci2 = veci2 + vec_zero_point; 
@@ -2160,12 +2206,14 @@ struct Vectorized()>> { return _vec.data(); } + template static Vectorized C10_ALWAYS_INLINE - loadu(const void* ptr, int count = size()) { + loadu(const U* ptr, int count = size()) { return Vectorized{vinner_type::loadu(ptr, 2 * count)}; } - void C10_ALWAYS_INLINE store(void* ptr, int count = size()) const { + template + void C10_ALWAYS_INLINE store(U* ptr, int count = size()) const { return _vec.store(ptr, 2 * count); } @@ -2776,19 +2824,23 @@ std::pair, Vectorized> inline deinterleave2< return inner_deinterleave2(a, b); } -inline Vectorized convert_uint8_to_float(const Vectorized &src) { +template +typename std::enable_if::value, at::vec::Vectorized>::type +inline convert_int8_to_float(const Vectorized &src) { // Note: this function only convert inputs number of elements equal to at::vec::Vectorized.size() // Only handle first 64 bits auto vec_int = src.to_vec_float_helper(); - return convert_to_float(vec_int); + return zvec_convert_to_float(vec_int); } -inline Vectorized convert_float_to_uint8(const Vectorized &src) { - constexpr auto min_val = std::numeric_limits::min(); - constexpr auto max_val = std::numeric_limits::max(); +template +typename std::enable_if::value, at::vec::Vectorized>::type +inline convert_float_to_int8(const Vectorized &src) { + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); - auto vec_int = clamp(convert_to_int(src), Vectorized(min_val), Vectorized(max_val)); + auto vec_int = clamp(zvec_convert_to_int(src), Vectorized(min_val), Vectorized(max_val)); return vec_int.to_vec_uint8_helper(); } diff --git a/aten/src/ATen/cpu/vec/vec512/vec512.h b/aten/src/ATen/cpu/vec/vec512/vec512.h index fd2e058b63ac6..c7fa23b23a607 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512.h @@ -13,6 +13,8 @@ #include #include #include +#include +#include #include #include @@ -55,7 +57,7 @@ std::ostream& operator<<(std::ostream& stream, const Vectorized& vec) { } -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CAST (AVX512) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -80,7 +82,8 @@ inline Vectorized cast(const Vectorized& src) } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - +#ifndef _MSC_VER +// MSVC is not working well on complex function overload. template std::enable_if_t> inline gather(const double* base_addr, const Vectorized& vindex) { @@ -92,9 +95,10 @@ std::enable_if_t& vindex) { return _mm512_i32gather_ps(vindex, base_addr, scale); } - +#endif // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ MASK GATHER ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - +#ifndef _MSC_VER +// MSVC is not working well on complex function overload. 
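The `_MSC_VER` guard added here excludes the gather and mask-gather overloads when building with MSVC, per the overload-resolution issue noted in the comment. As a hedged reference for what the float path of the guarded `mask_gather` overloads (shown next) boils down to, the core is a single AVX-512 intrinsic; `mask_gather16` is an illustrative name, not part of the patch.

```cpp
#include <immintrin.h>

// Lanes with a 0 bit in k keep the corresponding value from src; lanes with
// a 1 bit load base[idx[i]]. Scale 4 == sizeof(float).
__m512 mask_gather16(__m512 src, __mmask16 k, const float* base, __m512i idx) {
  return _mm512_mask_i32gather_ps(src, k, idx, base, /*scale=*/4);
}
```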
template std::enable_if_t> inline mask_gather(const Vectorized& src, const double* base_addr, @@ -112,7 +116,7 @@ inline mask_gather(const Vectorized& src, const float* base_addr, auto mask_ = _mm512_cmp_ps_mask(all_ones, mask.values, _CMP_EQ_OQ); return _mm512_mask_i32gather_ps(src, mask_, vindex, base_addr, scale); } - +#endif // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ CONVERT ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template<> @@ -127,6 +131,18 @@ inline convert_to_int_of_same_size(const Vectorized &src) { return _mm512_cvttps_epi32(src); } +template<> +Vectorized +inline convert_to_fp_of_same_size(const Vectorized &src) { + return _mm512_cvtepi64_pd(src); +} + +template<> +Vectorized +inline convert_to_fp_of_same_size(const Vectorized &src) { + return _mm512_cvtepi32_ps(src); +} + // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ INTERLEAVE ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ template <> @@ -258,6 +274,6 @@ inline Vectorized flip(const Vectorized & v) { return flip8(v); } -#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#endif // defined(CPU_CAPABILITY_AVX512) }}} diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h index acc074435a489..c7132349418de 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_bfloat16.h @@ -7,7 +7,8 @@ #include #include -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) +#define SLEEF_STATIC_LIBS #include #endif @@ -16,7 +17,18 @@ namespace vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) + +#ifndef SLEEF_CONST +#if (defined(__GNUC__) || defined(__CLANG__)) && !defined(__INTEL_COMPILER) +#define SLEEF_CONST const +#else +#define SLEEF_CONST +#endif +#define SLEEF_CONST_OLD SLEEF_CONST +#else +#define SLEEF_CONST_OLD +#endif // bfloat16 conversion static inline void cvtbf16_fp32(const __m256i& a, __m512& o) { @@ -30,6 +42,25 @@ static inline void cvtbf16_fp32(const __m512i& a, __m512& o1, __m512& o2) { cvtbf16_fp32(hi, o2); } +static inline __m256i cvtfp32_bf16(const __m512& src) { + __m512i value = _mm512_castps_si512(src); + __m512i nan = _mm512_set1_epi32(0xffff); + auto mask_value = _mm512_cmp_ps_mask(src, src, _CMP_ORD_Q); + __m512i ones = _mm512_set1_epi32(0x1); + __m512i vec_bias = _mm512_set1_epi32(0x7fff); + // uint32_t lsb = (input >> 16) & 1; + auto t_value = _mm512_and_si512(_mm512_srli_epi32(value, 16), ones); + // uint32_t rounding_bias = 0x7fff + lsb; + t_value = _mm512_add_epi32(t_value, vec_bias); + // input += rounding_bias; + t_value = _mm512_add_epi32(t_value, value); + // input = input >> 16; + t_value = _mm512_srli_epi32(t_value, 16); + // Check NaN before converting back to bf16 + t_value = _mm512_mask_blend_epi32(mask_value, nan, t_value); + return _mm512_cvtusepi32_epi16(t_value); +} + static inline __m512i cvtfp32_bf16(const __m512& a, const __m512& b) { __m512i lo = _mm512_castps_si512(a); __m512i hi = _mm512_castps_si512(b); @@ -81,6 +112,11 @@ static inline void cvtfp16_fp32(const __m512i& a, __m512& o1, __m512& o2) { cvtfp16_fp32(hi, o2); } +static inline __m256i cvtfp32_fp16(const __m512& src) { + return _mm512_cvtps_ph( + src, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); +} + static inline __m512i cvtfp32_fp16(const __m512& a, const __m512& b) { __m256i lo = _mm512_cvtps_ph( a, (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC)); @@ -343,7 +379,8 @@ static_assert( } #pragma 
clang diagnostic push #pragma clang diagnostic ignored "-Wignored-qualifiers" - Vectorized map(const __m512 (*const vop)(__m512)) const { + + Vectorized map(SLEEF_CONST __m512 (*SLEEF_CONST_OLD vop)(__m512)) const { __m512 lo, hi; cvt_to_fp32(values, lo, hi); const auto o1 = vop(lo); @@ -400,6 +437,9 @@ static_assert( Vectorized acos() const { return map(Sleef_acosf16_u10); } + Vectorized acosh() const { + return map(Sleef_acoshf16_u10); + } Vectorized asin() const { return map(Sleef_asinf16_u10); } @@ -936,6 +976,395 @@ Vectorized inline fmadd(const Vectorized& a, return cvtfp32_bf16(o1, o2); } +static inline void _transpose_mxn_half_16_16(__m256i t[], __m512i u[]) { + __m512i r[8]; + // a0a1 a2a3 a4a5 a6a7 a8a9 a10a11 a12a13 a14a15 e0e1 e2e3 e4e5 e6e7 e8e9 e10e11 e12e13 e14e15 + // b0-b15 f0-f15 + // c0-c15 g0-g15 + // d0-d15 h0-h15 + // i0-i15 m0-m15 + // j0-j15 n0-n15 + // k0-k15 o0-o15 + // l0-l15 p0-p15 +#pragma unroll(4) + for (int i = 0; i < 4; i++) { + r[i] = _mm512_inserti64x4(_mm512_castsi256_si512(t[i]), t[i + 4], 0x01); + r[i + 4] = _mm512_inserti64x4(_mm512_castsi256_si512(t[i + 8]), t[i + 12], 0x01); + } + + // u0: a0a1 b0b1 a2a3 b2b3 a8a9 b8b9 a10a11 b10b11 e0e1 f0f1 e2e3 f2f3 e8e9 f8f9 e10e11 f10f11 + // u1: a4a5 b4b5 a6a7 b6b7 a12a13 b12b13 a14a15 b14b15 e4e5 f4f5 e6e7 f6f7 e12e13 f12f13 e14e15 f14f15 + // u2: c0c1 d0d1 c2c3 d2d3 c8c9 d8d9 c10c11 d10d11 g0g1 h0h1 g2g3 h2h3 g8g9 h8h9 g10g11 h10h11 + // u3: c4c5 d4b5 c6c7 d6b7 c12c13 d12d13 c14c15 d14d15 g4g5 h4h5 g6g7 h6h7 g12g13 h12h13 g14g15 h14h15 + // i j m n + // k l o p +#pragma unroll(4) + for (int i = 0; i < 8; i += 2) { + u[i] = _mm512_unpacklo_epi32(r[i], r[i + 1]); + u[i + 1] = _mm512_unpackhi_epi32(r[i], r[i + 1]); + } + + // r0: a0a1 b0b1 c0c1 d0d1 a8a9 b8b9 c8c9 d8d9 e0e1 f0f1 g0g1 h0h1 e8e9 f8f9 g8g9 h8h9 + // r1: a2a3 b2b3 c2c3 d2d3 a10a11 b10b11 c10c11 d10d11 e2e3 f2f3 g2g3 h2h3 e10e11 f10f11 g10g11 h10h11 + // r2: a4a5 b4b5 c4c5 d4b5 a12a13 b12b13 c12c13 d12d13 + // r3: a6a7 b6b7 c6c7 d6b7 a14a15 b14b15 c14c15 d14d15 + // r4: i j k l m n o p + r[0] = _mm512_unpacklo_epi64(u[0], u[2]); + r[1] = _mm512_unpackhi_epi64(u[0], u[2]); + r[2] = _mm512_unpacklo_epi64(u[1], u[3]); + r[3] = _mm512_unpackhi_epi64(u[1], u[3]); + r[4] = _mm512_unpacklo_epi64(u[4], u[6]); + r[5] = _mm512_unpackhi_epi64(u[4], u[6]); + r[6] = _mm512_unpacklo_epi64(u[5], u[7]); + r[7] = _mm512_unpackhi_epi64(u[5], u[7]); + + __m512i const1 = _mm512_set_epi32( + 0x00370035, + 0x00330031, + 0x00270025, + 0x00230021, + 0x00170015, + 0x00130011, + 0x00070005, + 0x00030001, + 0x00360034, + 0x00320030, + 0x00260024, + 0x00220020, + 0x00160014, + 0x00120010, + 0x00060004, + 0x00020000); + __m512i const2 = _mm512_set_epi32( + 0x003f003d, + 0x003b0039, + 0x002f002d, + 0x002b0029, + 0x001f001d, + 0x001b0019, + 0x000f000d, + 0x000b0009, + 0x003e003c, + 0x003a0038, + 0x002e002c, + 0x002a0028, + 0x001e001c, + 0x001a0018, + 0x000e000c, + 0x000a0008); + // merge values from two regs + // 0-- 1-- + // 8-- 9-- + // 2-- 3-- + // 10-- 11-- + // 4-- 5-- + // 12-- 13-- + // 6-- 7-- + // 14-- 15-- +#pragma unroll(4) + for (int i = 0; i < 4; i++) { + u[i] = _mm512_permutex2var_epi16(r[i], const1, r[i + 4]); + u[i + 4] = _mm512_permutex2var_epi16(r[i], const2, r[i + 4]); + } +} + +// TODO(Leslie): Add the AVX2 Version of transpose_mxn for BFloat16 and Float16 +// Code referred to FBGEMM: +// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L1483-L1607 +template<> +inline void transpose_mxn( + const BFloat16* src, 
+ int64_t ld_src, + BFloat16* dst, + int64_t ld_dst) { + __m256i t[16]; + // load from src to registers + // a: a0 a1 a2 a3 a4 a5 a6 a7 a8 a9 a10 a11 a12 a13 a14 a15 + // b: b0 b1 b2 b3 b4 b5 b6 b7 b8 b9 b10 b11 b12 b13 b14 b15 + // c: c0 c1 c2 c3 c4 c5 c6 c7 c8 c9 c10 c11 c12 c13 c14 c15 + // d: d0 d1 d2 d3 d4 d5 d6 d7 d8 d9 d10 d11 d12 d13 d14 d15 + // e: e0 e1 e2 e3 e4 e5 e6 e7 e8 e9 e10 e11 e12 e13 e14 e15 + // f: f0 f1 f2 f3 f4 f5 f6 f7 f8 f9 f10 f11 f12 f13 f14 f15 + // g: g0 g1 g2 g3 g4 g5 g6 g7 g8 g9 g10 g11 g12 g13 g14 g15 + // h: h0 h1 h2 h3 h4 h5 h6 h7 h8 h9 h10 h11 h12 h13 h14 h15 + // i: i0 i1 i2 i3 i4 i5 i6 i7 i8 i9 i10 i11 i12 i13 i14 i15 + // j: j0 j1 j2 j3 j4 j5 j6 j7 j8 j9 j10 j11 j12 j13 j14 j15 + // k: k0 k1 k2 k3 k4 k5 k6 k7 k8 k9 k10 k11 k12 k13 k14 k15 + // l: l0 l1 l2 l3 l4 l5 l6 l7 l8 l9 l10 l11 l12 l13 l14 l15 + // m: m0 m1 m2 m3 m4 m5 m6 m7 m8 m9 m10 m11 m12 m13 m14 m15 + // n: n0 n1 n2 n3 n4 n5 n6 n7 n8 n9 n10 n11 n12 n13 n14 n15 + // o: o0 o1 o2 o3 o4 o5 o6 o7 o8 o9 o10 o11 o12 o13 o14 o15 + // p: p0 p1 p2 p3 p4 p5 p6 p7 p8 p9 p10 p11 p12 p13 p14 p15 +#pragma unroll(16) + for (int i = 0; i < 16; i++) { + t[i] = _mm256_loadu_si256(reinterpret_cast(src + i * ld_src)); + } + + __m512i u[8]; + _transpose_mxn_half_16_16(t, u); + +#pragma unroll(8) + for (int i = 0; i < 8; i++) { + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst), + _mm512_extracti32x8_epi32(u[i], 0x0)); + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(dst + (i * 2 + 1) * ld_dst), + _mm512_extracti32x8_epi32(u[i], 0x01)); + } +} + +// Code referred to FBGEMM: +// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#L1483-L1607 +template<> +inline void transpose_mxn( + const Half* src, + int64_t ld_src, + Half* dst, + int64_t ld_dst) { + __m256i t[16]; + // load from src to registers + // Same matrix indices as above transpose_mxn +#pragma unroll(16) + for (int i = 0; i < 16; i++) { + t[i] = _mm256_loadu_si256(reinterpret_cast(src + i * ld_src)); + } + + __m512i u[8]; + _transpose_mxn_half_16_16(t, u); + +#pragma unroll(8) + for (int i = 0; i < 8; i++) { + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(dst + (i * 2) * ld_dst), + _mm512_extracti32x8_epi32(u[i], 0x0)); + _mm256_storeu_si256( + reinterpret_cast<__m256i*>(dst + (i * 2 + 1) * ld_dst), + _mm512_extracti32x8_epi32(u[i], 0x01)); + } +} + +static inline void _transpose_mxn_half_32_32(__m512i r[], __m512i d[]) { + // t[0]: 0 32 1 33 2 34 3 35 8 40 9 41 10 42 11 43 16 ... 59 + // t[1]: 4 36 5 37 6 38 7 39 12 44 13 45 14 46 15 47 20 ... 63 + // t[2]: 64 96 65 97 66 98 67 99 72 104 73 105 74 106 75 ... 123 + // t[3]: 68 100 69 101 70 102 71 103 76 108 77 109 78 110 79 111 84 ... 127 + // t[4]: 128 160 129 161 130 162 131 163 136 168 137 169 138 170 139 171 144 ... 187 + // t[5]: 132 164 133 165 134 166 135 167 140 172 141 173 142 174 143 175 148 ... 191 + // t[6]: 192 224 193 225 194 226 195 227 200 232 201 233 202 234 203 235 208 ... 251 + // t[7]: 196 228 197 229 198 230 199 231 204 236 205 237 206 238 207 239 212 ... 255 + // t[8]: 256 288 257 289 258 290 259 291 264 296 265 297 266 298 267 299 272 ... 315 + // t[9]: 260 292 261 293 262 294 263 295 268 300 269 301 270 302 271 303 276 ... 319 + // t[10]: 320 352 321 353 322 354 323 355 328 360 329 361 330 362 331 363 336 ... 379 + // t[11]: 324 356 325 357 326 358 327 359 332 364 333 365 334 366 335 367 340 ... 383 + // t[12]: 384 416 385 417 386 418 387 419 392 424 393 425 394 426 395 427 400 ... 
443 + // t[13]: 388 420 389 421 390 422 391 423 396 428 397 429 398 430 399 431 404 ... 447 + // t[14]: 448 480 449 481 450 482 451 483 456 488 457 489 458 490 459 491 464 ... 507 + // t[15]: 452 484 453 485 454 486 455 487 460 492 461 493 462 494 463 495 468 ... 511 + // t[16]: 512 544 513 545 514 546 515 547 520 552 521 553 522 554 523 555 528 ... 571 + // ... + // t[31]: 964 996 965 997 966 998 967 999 972 1004 973 1005 974 1006 975 1007 980 ... 1023 +#pragma unroll(16) + for (int i = 0; i < 16; ++i) { + d[i * 2] = _mm512_unpacklo_epi16(r[i * 2], r[i * 2 + 1]); + d[i * 2 + 1] = _mm512_unpackhi_epi16(r[i * 2], r[i * 2 + 1]); + } + + // t[0]: 0 32 64 96 1 33 65 97 8 40 72 104 9 41 73 105 16 ... 121 + // t[1]: 2 34 66 98 3 35 67 99 10 42 74 106 11 43 75 107 18 ... 123 + // t[2]: 4 36 68 100 5 37 69 101 12 44 76 108 13 45 77 109 20 ... 125 + // t[3]: 6 38 70 102 7 39 71 103 14 46 78 110 15 47 79 111 22 ... 127 + // t[4]: 128 160 192 224 129 161 193 225 136 168 200 232 137 169 201 233 144 ... 249 + // t[5]: 130 162 194 226 131 163 195 227 138 170 202 234 139 171 203 235 146 ... 251 + // t[6]: 132 164 196 228 133 165 197 229 140 172 204 236 141 173 205 237 148 ... 253 + // t[7]: 134 166 198 230 135 167 199 231 142 174 206 238 143 175 207 239 150 ... 255 + // t[8]: 256 288 320 352 257 289 321 353 264 296 328 360 265 297 329 361 272 ... 377 + // t[9]: 258 290 322 354 259 291 323 355 266 298 330 362 267 299 331 363 274 ... 379 + // t[10]: 260 292 324 356 261 293 325 357 268 300 332 364 269 301 333 365 276 ... 381 + // t[11]: 262 294 326 358 263 295 327 359 270 302 334 366 271 303 335 367 278 ... 383 + // t[12]: 384 416 448 480 385 417 449 481 392 424 456 488 393 425 457 489 400 ... 505 + // t[13]: 386 418 450 482 387 419 451 483 394 426 458 490 395 427 459 491 402 ... 507 + // t[14]: 388 420 452 484 389 421 453 485 396 428 460 492 397 429 461 493 404 ... 509 + // t[15]: 390 422 454 486 391 423 455 487 398 430 462 494 399 431 463 495 406 ... 511 + // t[16]: 512 544 576 608 513 545 577 609 520 552 584 616 521 553 585 617 528 ... 633 + // ... + // t[31]: 902 934 966 998 903 935 967 999 910 942 974 1006 911 943 975 1007 918 ... 1023 +#pragma unroll(8) + for (int i = 0; i < 8; ++i) { + r[i * 4] = _mm512_unpacklo_epi32(d[i * 4], d[i * 4 + 2]); + r[i * 4 + 1] = _mm512_unpackhi_epi32(d[i * 4], d[i * 4 + 2]); + r[i * 4 + 2] = _mm512_unpacklo_epi32(d[i * 4 + 1], d[i * 4 + 3]); + r[i * 4 + 3] = _mm512_unpackhi_epi32(d[i * 4 + 1], d[i * 4 + 3]); + } + + // t[0]: 0 32 64 96 128 160 192 224 8 40 72 104 136 168 200 232 16 ... 248 + // t[1]: 1 33 65 97 129 161 193 225 9 41 73 105 137 169 201 233 17 ... 249 + // t[2]: 2 34 66 98 130 162 194 226 10 42 74 106 138 170 202 234 18 ... 250 + // t[3]: 3 35 67 99 131 163 195 227 11 43 75 107 139 171 203 235 19 ... 251 + // t[4]: 4 36 68 100 132 164 196 228 12 44 76 108 140 172 204 236 20 ... 252 + // t[5]: 5 37 69 101 133 165 197 229 13 45 77 109 141 173 205 237 21 ... 253 + // t[6]: 6 38 70 102 134 166 198 230 14 46 78 110 142 174 206 238 22 ... 254 + // t[7]: 7 39 71 103 135 167 199 231 15 47 79 111 143 175 207 239 23 ... 255 + // t[8]: 256 288 320 352 384 416 448 480 264 296 328 360 392 424 456 488 272 ... 504 + // t[9]: 257 289 321 353 385 417 449 481 265 297 329 361 393 425 457 489 273 ... 505 + // t[10]: 258 290 322 354 386 418 450 482 266 298 330 362 394 426 458 490 274 ... 506 + // t[11]: 259 291 323 355 387 419 451 483 267 299 331 363 395 427 459 491 275 ... 507 + // t[12]: 260 292 324 356 388 420 452 484 268 300 332 364 396 428 460 492 276 ... 
508 + // t[13]: 261 293 325 357 389 421 453 485 269 301 333 365 397 429 461 493 277 ... 509 + // t[14]: 262 294 326 358 390 422 454 486 270 302 334 366 398 430 462 494 278 ... 510 + // t[15]: 263 295 327 359 391 423 455 487 271 303 335 367 399 431 463 495 279 ... 511 + // t[16]: 512 544 576 608 640 672 704 736 520 552 584 616 648 680 712 744 528 ... 760 + // ... + // t[31]: 775 807 839 871 903 935 967 999 783 815 847 879 911 943 975 1007 791 ... 1023 +#pragma unroll(4) + for (int i = 0; i < 4; ++i) { + d[i * 8] = _mm512_unpacklo_epi64(r[i * 8], r[i * 8 + 4]); + d[i * 8 + 1] = _mm512_unpackhi_epi64(r[i * 8], r[i * 8 + 4]); + d[i * 8 + 2] = _mm512_unpacklo_epi64(r[i * 8 + 1], r[i * 8 + 5]); + d[i * 8 + 3] = _mm512_unpackhi_epi64(r[i * 8 + 1], r[i * 8 + 5]); + d[i * 8 + 4] = _mm512_unpacklo_epi64(r[i * 8 + 2], r[i * 8 + 6]); + d[i * 8 + 5] = _mm512_unpackhi_epi64(r[i * 8 + 2], r[i * 8 + 6]); + d[i * 8 + 6] = _mm512_unpacklo_epi64(r[i * 8 + 3], r[i * 8 + 7]); + d[i * 8 + 7] = _mm512_unpackhi_epi64(r[i * 8 + 3], r[i * 8 + 7]); + } + + // t[0]: 0 32 64 96 128 160 192 224 256 288 320 352 384 416 448 480 16 ... 496 + // t[1]: 1 33 65 97 129 161 193 225 257 289 321 353 385 417 449 481 17 ... 497 + // t[2]: 2 34 66 98 130 162 194 226 258 290 322 354 386 418 450 482 18 ... 498 + // t[3]: 3 35 67 99 131 163 195 227 259 291 323 355 387 419 451 483 19 ... 499 + // t[4]: 4 36 68 100 132 164 196 228 260 292 324 356 388 420 452 484 20 ... 500 + // t[5]: 5 37 69 101 133 165 197 229 261 293 325 357 389 421 453 485 21 ... 501 + // t[6]: 6 38 70 102 134 166 198 230 262 294 326 358 390 422 454 486 22 ... 502 + // t[7]: 7 39 71 103 135 167 199 231 263 295 327 359 391 423 455 487 23 ... 503 + // t[8]: 8 40 72 104 136 168 200 232 264 296 328 360 392 424 456 488 24 ... 504 + // t[9]: 9 41 73 105 137 169 201 233 265 297 329 361 393 425 457 489 25 ... 505 + // t[10]: 10 42 74 106 138 170 202 234 266 298 330 362 394 426 458 490 26 ... 506 + // t[11]: 11 43 75 107 139 171 203 235 267 299 331 363 395 427 459 491 27 ... 507 + // t[12]: 12 44 76 108 140 172 204 236 268 300 332 364 396 428 460 492 28 ... 508 + // t[13]: 13 45 77 109 141 173 205 237 269 301 333 365 397 429 461 493 29 ... 509 + // t[14]: 14 46 78 110 142 174 206 238 270 302 334 366 398 430 462 494 30 ... 510 + // t[15]: 15 47 79 111 143 175 207 239 271 303 335 367 399 431 463 495 31 ... 511 + // t[16]: 512 544 576 608 640 672 704 736 768 800 832 864 896 928 960 992 528 ... 1008 + // ... + // t[31]: 527 559 591 623 655 687 719 751 783 815 847 879 911 943 975 1007 543 ... 1023 + __m512i const1 = _mm512_set_epi64( + 0x000000000000000d, + 0x000000000000000c, + 0x0000000000000005, + 0x0000000000000004, + 0x0000000000000009, + 0x0000000000000008, + 0x0000000000000001, + 0x0000000000000000); + __m512i const2 = _mm512_set_epi64( + 0x000000000000000f, + 0x000000000000000e, + 0x0000000000000007, + 0x0000000000000006, + 0x000000000000000b, + 0x000000000000000a, + 0x0000000000000003, + 0x0000000000000002); +#pragma unroll(8) + for (int i = 0; i < 8; ++i) { + r[i] = _mm512_permutex2var_epi64(d[i], /*idx*/const1, d[i + 8]); + r[i + 8] = _mm512_permutex2var_epi64(d[i], /*idx*/const2, d[i + 8]); + r[i + 16] = _mm512_permutex2var_epi64(d[i + 16], /*idx*/const1, d[i + 24]); + r[i + 24] = _mm512_permutex2var_epi64(d[i + 16], /*idx*/const2, d[i + 24]); + } + + // t[0]: 0 32 64 96 128 160 192 224 256 288 320 352 384 416 448 480 512 544 ... 992 + // t[1]: 1 33 65 97 129 161 193 225 257 289 321 353 385 417 449 481 513 545 ... 
993 + // t[2]: 2 34 66 98 130 162 194 226 258 290 322 354 386 418 450 482 514 546 ... 994 + // t[3]: 3 35 67 99 131 163 195 227 259 291 323 355 387 419 451 483 515 547 ... 995 + // t[4]: 4 36 68 100 132 164 196 228 260 292 324 356 388 420 452 484 516 548 ... 996 + // t[5]: 5 37 69 101 133 165 197 229 261 293 325 357 389 421 453 485 517 549 ... 997 + // t[6]: 6 38 70 102 134 166 198 230 262 294 326 358 390 422 454 486 518 550 ... 998 + // t[7]: 7 39 71 103 135 167 199 231 263 295 327 359 391 423 455 487 519 551 ... 999 + // t[8]: 8 40 72 104 136 168 200 232 264 296 328 360 392 424 456 488 520 552 ... 1000 + // t[9]: 9 41 73 105 137 169 201 233 265 297 329 361 393 425 457 489 521 553 ... 1001 + // t[10]: 10 42 74 106 138 170 202 234 266 298 330 362 394 426 458 490 522 554 ... 1002 + // t[11]: 11 43 75 107 139 171 203 235 267 299 331 363 395 427 459 491 523 555 ... 1003 + // t[12]: 12 44 76 108 140 172 204 236 268 300 332 364 396 428 460 492 524 556 ... 1004 + // t[13]: 13 45 77 109 141 173 205 237 269 301 333 365 397 429 461 493 525 557 ... 1005 + // t[14]: 14 46 78 110 142 174 206 238 270 302 334 366 398 430 462 494 526 558 ... 1006 + // t[15]: 15 47 79 111 143 175 207 239 271 303 335 367 399 431 463 495 527 559 ... 1007 + // t[16]: 16 48 80 112 144 176 208 240 272 304 336 368 400 432 464 496 528 560 ... 1008 + // ... + // t[31]: 31 63 95 127 159 191 223 255 287 319 351 383 415 447 479 511 543 575 ... 1023 + __m512i const3 = _mm512_set_epi64( + 0x000000000000000b, + 0x000000000000000a, + 0x0000000000000009, + 0x0000000000000008, + 0x0000000000000003, + 0x0000000000000002, + 0x0000000000000001, + 0x0000000000000000); + __m512i const4 = _mm512_set_epi64( + 0x000000000000000f, + 0x000000000000000e, + 0x000000000000000d, + 0x000000000000000c, + 0x0000000000000007, + 0x0000000000000006, + 0x0000000000000005, + 0x0000000000000004); +#pragma unroll(16) + for (int i = 0; i < 16; ++i) { + d[i] = _mm512_permutex2var_epi64(r[i], /*idx*/const3, r[i + 16]); + d[i + 16] = _mm512_permutex2var_epi64(r[i], /*idx*/const4, r[i + 16]); + } +} + +// Code referred to FBGEMM: +// https://github.com/pytorch/FBGEMM/blob/39a423e4ad1a04b77fea81c7d09c3e6f8984fae9/src/UtilsAvx512.cc#LL19C6-L19C6 +template<> +inline void transpose_mxn( + const BFloat16* src, + int64_t ld_src, + BFloat16* dst, + int64_t ld_dst) { + // Load from memory + __m512i r[32]; +#pragma unroll(32) + for (int i = 0; i < 32; ++i) { + r[i] = _mm512_loadu_si512(reinterpret_cast(src + i* ld_src)); + } + + __m512i d[32]; + _transpose_mxn_half_32_32(r, d); + + // Store to dst +#pragma unroll(32) + for (int i = 0; i < 32; ++i) { + _mm512_storeu_si512(dst + i* ld_dst, d[i]); + } +} + +template<> +inline void transpose_mxn( + const Half* src, + int64_t ld_src, + Half* dst, + int64_t ld_dst) { + // Load from memory + __m512i r[32]; +#pragma unroll(32) + for (int i = 0; i < 32; ++i) { + r[i] = _mm512_loadu_si512(reinterpret_cast(src + i* ld_src)); + } + + __m512i d[32]; + _transpose_mxn_half_32_32(r, d); + + // Store to dst +#pragma unroll(32) + for (int i = 0; i < 32; ++i) { + _mm512_storeu_si512(dst + i* ld_dst, d[i]); + } +} + template <> class Vectorized: public Vectorized16 { public: @@ -1160,7 +1589,7 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V CONVERT_VECTORIZED_INIT(BFloat16, bfloat16); CONVERT_VECTORIZED_INIT(Half, half); -#else //defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#else //defined(CPU_CAPABILITY_AVX512) #define CONVERT_NON_VECTORIZED_INIT(type, name) \ inline std::tuple, Vectorized> 
convert_##name##_float(const Vectorized& a) { \ @@ -1190,9 +1619,9 @@ inline Vectorized convert_float_##name(const Vectorized& a, const V CONVERT_NON_VECTORIZED_INIT(BFloat16, bfloat16); CONVERT_NON_VECTORIZED_INIT(Half, half); -#endif // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#endif // defined(CPU_CAPABILITY_AVX512) -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) #define LOAD_FP32_VECTORIZED_INIT(type, name) \ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ auto values = _mm256_loadu_si256(reinterpret_cast(data)); \ @@ -1211,7 +1640,7 @@ inline void load_fp32_from_##name(const type *data, Vectorized& out1, Vec LOAD_FP32_VECTORIZED_INIT(BFloat16, bf16); LOAD_FP32_VECTORIZED_INIT(Half, fp16); -#else // defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#else // defined(CPU_CAPABILITY_AVX512) #define LOAD_FP32_NON_VECTORIZED_INIT(type, name) \ inline void load_fp32_from_##name(const type *data, Vectorized& out) { \ __at_align__ float values[Vectorized::size()]; \ diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h index 02aa3a87cc130..c35204f9da23e 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_double.h @@ -7,7 +7,8 @@ #include #include #include -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) +#define SLEEF_STATIC_LIBS #include #endif @@ -16,7 +17,7 @@ namespace vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) template <> class Vectorized> { private: @@ -203,7 +204,7 @@ template <> class Vectorized> { auto abs = abs_(); auto zero = _mm512_setzero_pd(); auto mask = _mm512_cmp_pd_mask(abs, zero, _CMP_EQ_OQ); - auto div = values / abs; + auto div = _mm512_div_pd(values, abs); return _mm512_mask_blend_pd(mask, div, zero); } __m512d real_() const { diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h index a5d790c98b2f2..2801e484d94ce 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_complex_float.h @@ -7,7 +7,8 @@ #include #include #include -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) +#define SLEEF_STATIC_LIBS #include #endif @@ -16,7 +17,7 @@ namespace vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) template <> class Vectorized> { private: @@ -708,7 +709,7 @@ template <> class Vectorized> { auto abs = abs_(); auto zero = _mm512_setzero_ps(); auto mask = _mm512_cmp_ps_mask(abs, zero, _CMP_EQ_OQ); - auto div = values / abs; + auto div = _mm512_div_ps(values, abs); return _mm512_mask_blend_ps(mask, div, zero); } __m512 real_() const { diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_convert.h b/aten/src/ATen/cpu/vec/vec512/vec512_convert.h new file mode 100644 index 0000000000000..e8ad662a99fc2 --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec512/vec512_convert.h @@ -0,0 +1,181 @@ +#pragma once + +#include +#include +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) + +template <> +struct VecConvert { + static inline VectorizedN 
apply( + const VectorizedN& src) { + VectorizedN result; + __m512 value; + cvtbf16_fp32(_mm512_castsi512_si256(src[0]), value); + result[0] = value; + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN result; + __m512 value; + cvtfp16_fp32(_mm512_castsi512_si256(src[0]), value); + result[0] = value; + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + VectorizedN result; + result[0] = _mm512_castsi256_si512(cvtfp32_bf16(src[0])); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN result; + result[0] = _mm512_castsi256_si512(cvtfp32_fp16(src[0])); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto low = _mm512_cvtepi64_ps(src[0]); + auto high = _mm512_cvtepi64_ps(src[1]); + return Vectorized( + _mm512_insertf32x8(_mm512_castps256_ps512(low), high, 1)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + at::vec::VectorizedN result; + result[0] = _mm512_cvt_roundps_epi64( + _mm512_castps512_ps256(src[0]), _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + result[1] = _mm512_cvt_roundps_epi64( + _mm512_extractf32x8_ps(src[0], 1), + _MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto low = _mm512_cvtepi64_epi32(src[0]); + auto high = _mm512_cvtepi64_epi32(src[1]); + return Vectorized( + _mm512_inserti32x8(_mm512_castsi256_si512(low), high, 1)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + at::vec::VectorizedN result; + result[0] = _mm512_cvtepi32_epi64(_mm512_castsi512_si256(src[0])); + result[1] = _mm512_cvtepi32_epi64(_mm512_extracti32x8_epi32(src[0], 1)); + return result; + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto src128 = _mm512_castsi512_si128(src[0]); + return Vectorized(_mm512_cvtepi8_epi32(src128)); + } +}; + +template <> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + auto src128 = _mm512_castsi512_si128(src[0]); + return Vectorized(_mm512_cvtepu8_epi32(src128)); + } +}; + +template +struct VecConvert< + dst_t, + 1, + src_t, + 1, + typename std::enable_if_t< + (is_reduced_floating_point_v && is_8bit_integer_v) || + (is_reduced_floating_point_v && is_8bit_integer_v), + void>> { + static inline VectorizedN apply(const VectorizedN& src) { + VectorizedN tmp_fp32 = VecConvert::apply(src); + return VecConvert::apply(tmp_fp32); + } +}; + +template +struct VecConvert< + dst_t, + 1, + float, + 1, + typename std::enable_if_t, + void>> { + static inline VectorizedN apply(const VectorizedN& src) { + return convert_float_to_int8(src[0]); + } +}; + +template +struct VecConvert< + float, + 1, + src_t, + 1, + typename std::enable_if_t, + void>> { + static inline VectorizedN apply(const VectorizedN& src) { + return convert_int8_to_float(src[0]); + } +}; + +template +struct VecConvert< + dst_t, + 1, + int64_t, + 2, + typename std::enable_if< + std::is_same_v || + std::is_same_v>::type> { + static inline VectorizedN apply( + const VectorizedN& src) { + return VecConvert::apply( + VecConvert::apply(src)); + } +}; + +#endif + +} // namespace CPU_CAPABILITY 
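// Illustrative usage, not part of the patch: the AVX-512 VecConvert
// specializations above are reached through the generic at::vec::convert<>
// helpers declared in vec_convert.h. Assuming an AVX-512 build, widening the
// low 16 bf16 lanes of a vector to fp32 could look like this (the helper name
// bf16_lo_to_fp32 is made up for the example):
inline Vectorized<float> bf16_lo_to_fp32(const Vectorized<c10::BFloat16>& src) {
  // Dispatches to the bf16 -> fp32 VecConvert specialization above, which
  // converts only the first Vectorized<float>::size() elements of src.
  return convert<float>(src);
}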
+} // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_double.h b/aten/src/ATen/cpu/vec/vec512/vec512_double.h index 2089e3a6c620b..508ab257e603b 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_double.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_double.h @@ -6,7 +6,8 @@ #include #include #include -#if (defined(CPU_CAPABILITY_AVX512)) && !defined(_MSC_VER) +#if (defined(CPU_CAPABILITY_AVX512)) +#define SLEEF_STATIC_LIBS #include #endif @@ -15,7 +16,7 @@ namespace vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) template <> class Vectorized { private: @@ -106,6 +107,10 @@ template <> class Vectorized { return _mm512_castsi512_pd(_mm512_mask_set1_epi64(zero_vector, cmp_mask, 0xFFFFFFFFFFFFFFFF)); } + bool has_inf_nan() const { + __m512d self_sub = _mm512_sub_pd(values, values); + return (_mm512_movepi8_mask(_mm512_castpd_si512(self_sub)) & 0x7777777777777777) != 0; + } Vectorized map(double (*const f)(double)) const { __at_align__ double tmp[size()]; store(tmp); @@ -145,6 +150,9 @@ template <> class Vectorized { Vectorized acos() const { return Vectorized(Sleef_acosd8_u10(values)); } + Vectorized acosh() const { + return Vectorized(Sleef_acoshd8_u10(values)); + } Vectorized asin() const { return Vectorized(Sleef_asind8_u10(values)); } diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_float.h b/aten/src/ATen/cpu/vec/vec512/vec512_float.h index 633b5990a26b1..a08df3c141a38 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_float.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_float.h @@ -6,7 +6,8 @@ #include #include #include -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) +#define SLEEF_STATIC_LIBS #include #endif @@ -15,7 +16,7 @@ namespace vec { // See Note [CPU_CAPABILITY namespace] inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) template <> class Vectorized { private: @@ -125,6 +126,10 @@ template <> class Vectorized { return _mm512_castsi512_ps(_mm512_mask_set1_epi32(zero_vec, mask, 0xFFFFFFFF)); } + bool has_inf_nan() const { + __m512 self_sub = _mm512_sub_ps(values, values); + return (_mm512_movepi8_mask(_mm512_castps_si512(self_sub)) & 0x7777777777777777) != 0; + } Vectorized map(float (*const f)(float)) const { __at_align__ float tmp[size()]; store(tmp); @@ -164,6 +169,9 @@ template <> class Vectorized { Vectorized acos() const { return Vectorized(Sleef_acosf16_u10(values)); } + Vectorized acosh() const { + return Vectorized(Sleef_acoshf16_u10(values)); + } Vectorized asin() const { return Vectorized(Sleef_asinf16_u10(values)); } @@ -239,14 +247,14 @@ template <> class Vectorized { static __m512 vec_factorial_5 = _mm512_set1_ps(0.00828929059f); // 1/factorial(5) static __m512 vec_exp_log2ef = - (__m512)_mm512_set1_epi32(0x3fb8aa3b); // log2(e) + _mm512_castsi512_ps(_mm512_set1_epi32(0x3fb8aa3b)); // log2(e) static __m512 vec_half = _mm512_set1_ps(0.5f); static __m512 vec_one = _mm512_set1_ps(1.f); static __m512 vec_zero = _mm512_set1_ps(0.f); static __m512 vec_two = _mm512_set1_ps(2.f); - static __m512 vec_ln2f = (__m512)_mm512_set1_epi32(0x3f317218); // ln(2) - static __m512 vec_ln_flt_min = (__m512)_mm512_set1_epi32(0xc2aeac50); - static __m512 vec_ln_flt_max = (__m512)_mm512_set1_epi32(0x42b17218); + static __m512 vec_ln2f = _mm512_castsi512_ps(_mm512_set1_epi32(0x3f317218)); // ln(2) + static __m512 vec_ln_flt_min = 
_mm512_castsi512_ps(_mm512_set1_epi32(0xc2aeac50)); + static __m512 vec_ln_flt_max = _mm512_castsi512_ps(_mm512_set1_epi32(0x42b17218)); static __m512i vec_127 = _mm512_set1_epi32(0x0000007f); static int n_mantissa_bits = 23; @@ -281,7 +289,7 @@ template <> class Vectorized { auto vec_exp_number_i = _mm512_cvtps_epi32(vec_exp_number); auto vec_two_pow_n_i = _mm512_add_epi32(vec_exp_number_i, vec_127); vec_two_pow_n_i = _mm512_slli_epi32(vec_two_pow_n_i, n_mantissa_bits); - auto vec_two_pow_n = (__m512)vec_two_pow_n_i; + auto vec_two_pow_n = _mm512_castsi512_ps(vec_two_pow_n_i); vec_two_pow_n = _mm512_mask_blend_ps(less_ln_flt_min_mask, vec_two_pow_n, vec_zero); diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_int.h b/aten/src/ATen/cpu/vec/vec512/vec512_int.h index 2610d344380b3..1022221c81a19 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_int.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_int.h @@ -540,7 +540,7 @@ class Vectorized : public Vectorizedi { template class Vectorized8 : public Vectorizedi { static_assert( - std::is_same::value || std::is_same::value, + std::is_same_v || std::is_same_v, "Only int8_t/uint8_t are supported"); protected: static constexpr __m512i zero_vector {0, 0, 0, 0, 0, 0, 0, 0}; @@ -1069,7 +1069,7 @@ Vectorized inline maximum(const Vectorized& a, const Vectorized< template <> Vectorized inline maximum(const Vectorized& a, const Vectorized& b) { - return _mm512_max_epi8(a, b); + return _mm512_max_epu8(a, b); } template <> @@ -1320,7 +1320,7 @@ inline Vectorized Vectorized::le(const Vectorized& ot return (*this <= other) & Vectorized(1); } -template ::value || std::is_same::value, int> = 0> +template || std::is_same_v, int> = 0> Vectorized inline shift_512_8(const Vectorized& a, const Vectorized& b) { // No vector instruction for shifting int8_t/uint8_t, so emulating // it instead. 
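The has_inf_nan() members added above for Vectorized<double> and Vectorized<float> exploit the identity that x - x is zero for every finite x but NaN for +/-inf and NaN inputs; the vector code then detects the resulting NaN lanes with a byte-level mask test. A scalar sketch of the same idea, illustrative only and not part of the patch:

static bool has_inf_nan_scalar(const float* v, int n) {
  for (int i = 0; i < n; ++i) {
    float d = v[i] - v[i];  // 0.0f when v[i] is finite, NaN when it is +/-inf or NaN
    if (d != d) {           // only NaN compares unequal to itself
      return true;
    }
  }
  return false;
}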
diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_mask.h b/aten/src/ATen/cpu/vec/vec512/vec512_mask.h new file mode 100644 index 0000000000000..9ba1b18372eb5 --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec512/vec512_mask.h @@ -0,0 +1,155 @@ +#pragma once + +#include +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) + +template +struct VecMaskLoad< + T, + 1, + mask_t, + 1, + typename std::enable_if_t< + std::is_same_v || std::is_same_v || + std::is_same_v, + void>> { + static inline VectorizedN apply( + const T* ptr, + const VecMask& vec_mask) { + at::vec::Vectorized zero_vec(0); + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + auto int_mask = vec_mask.template cast()[0]; + auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ); + if constexpr (std::is_same_v) { + return Vectorized(_mm512_mask_loadu_ps(zero_vec, mmask, ptr)); + } else { + return Vectorized(_mm512_mask_loadu_epi32(zero_vec, mmask, ptr)); + } + } +}; + +template +struct VecMaskLoad< + data_t, + 1, + mask_t, + 1, + typename std::enable_if< + std::is_same_v || + std::is_same_v>::type> { + static inline VectorizedN apply( + const data_t* ptr, + const VecMask& vec_mask) { + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + auto int_mask = vec_mask.template cast()[0]; + auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ); + auto zero = _mm256_set1_epi16(0); + auto temp = _mm256_mask_loadu_epi16(zero, mmask, ptr); + return Vectorized( + _mm512_inserti32x8(_mm512_castsi256_si512(temp), zero, 1)); + } +}; + +template +struct VecMaskLoad< + data_t, + 1, + mask_t, + 1, + typename std::enable_if< + std::is_same_v || + std::is_same_v>::type> { + static inline VectorizedN apply( + const data_t* ptr, + const VecMask& vec_mask) { + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + auto int_mask = vec_mask.template cast()[0]; + auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ); + auto zero = _mm_set1_epi8(0); + auto temp = _mm_mask_loadu_epi8(zero, mmask, ptr); + return Vectorized( + _mm512_inserti64x2(_mm512_set1_epi32(0), temp, 0)); + } +}; + +template +struct VecMaskLoad { + static inline VectorizedN apply( + const int64_t* ptr, + const VecMask& vec_mask) { + auto all_ones = _mm512_set1_epi32(0xFFFFFFFF); + auto zero = _mm512_set1_epi64(0); + auto int_mask = vec_mask.template cast()[0]; + auto mmask = _mm512_cmp_epi32_mask(int_mask, all_ones, _MM_CMPINT_EQ); + at::vec::VectorizedN result; + result[0] = _mm512_mask_loadu_epi64(zero, (__mmask8)mmask, ptr); + result[1] = _mm512_mask_loadu_epi64(zero, (__mmask8)(mmask >> 8), ptr + 8); + return result; + } +}; + +template <> +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + return Vectorized(_mm512_castsi512_ps(vec_mask[0])); + } +}; + +template <> +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + return Vectorized(_mm512_castps_si512(vec_mask[0])); + } +}; + +template +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + auto int_vec = convert(VectorizedN(vec_mask)); + return VecMask(int_vec).cast(); + } +}; + +template <> +inline bool VecMask::all_zero() const { + __mmask16 mask = _mm512_test_epi32_mask(mask_[0], mask_[0]); + return mask == 0; +} + +template <> +inline bool VecMask::is_masked(int i) const { + return _mm512_movepi32_mask(mask_[0]) & (1 << i); +} + +template <> +inline bool VecMask::all_masked() const { + __mmask16 mask = _mm512_movepi32_mask(mask_[0]); 
+ return mask == 0xffff; +} + +#define VEC_MASK_METHOD_WITH_CAST_TO_INT( \ + T, N, return_type, method, args_def, args) \ + template <> \ + inline return_type VecMask::method args_def const { \ + return cast().method args; \ + } + +VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_zero, (), ()) +VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_zero, (), ()) +VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, is_masked, (int i), (i)) +VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, is_masked, (int i), (i)) +VEC_MASK_METHOD_WITH_CAST_TO_INT(float, 1, bool, all_masked, (), ()) +VEC_MASK_METHOD_WITH_CAST_TO_INT(int64_t, 2, bool, all_masked, (), ()) + +#undef VEC_MASK_DEFINE_METHOD_WITH_CAST_TO_INT + +#endif + +} // namespace CPU_CAPABILITY +} // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h index b03da5d2c3e95..21389da3cdc03 100644 --- a/aten/src/ATen/cpu/vec/vec512/vec512_qint.h +++ b/aten/src/ATen/cpu/vec/vec512/vec512_qint.h @@ -42,11 +42,17 @@ namespace at { namespace vec { inline namespace CPU_CAPABILITY { -#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) +#if defined(CPU_CAPABILITY_AVX512) +#ifdef _MSC_VER +__declspec(align(64)) struct Vectorizedqi { + protected: + __m512i vals; +#else struct Vectorizedqi { protected: __m512i vals __attribute__((aligned(64))); +#endif public: Vectorizedqi() {} @@ -98,28 +104,36 @@ inline __m512i pack_saturate_and_clamp( _mm512_min_epu8(packed_and_sat, _mm512_set1_epi8(max_val))); } -inline Vectorized convert_uint8_to_float(at::vec::Vectorized src) { +template +typename std::enable_if_t || std::is_same_v, at::vec::Vectorized> +inline convert_int8_to_float(at::vec::Vectorized src) { // Note: this function only convert inputs number of elements equal to at::vec::Vectorized.size() - // Only handle first 128 bits + // Only handle first 16*8 bits __m128i input_128 = _mm512_castsi512_si128(src); - // Convert from 16*u8 to 16*int32 - __m512i input_512_extended = _mm512_cvtepu8_epi32(input_128); + // Convert from 16*uint8/int8 to 16*int32 + __m512i input_512_extended; + if constexpr (std::is_same_v) + input_512_extended = _mm512_cvtepu8_epi32(input_128); + else + input_512_extended = _mm512_cvtepi8_epi32(input_128); // Convert from 16*int32 to 16*float32 return _mm512_cvtepi32_ps(input_512_extended); } -inline Vectorized convert_float_to_uint8(at::vec::Vectorized src) { +template +typename std::enable_if_t || std::is_same_v, at::vec::Vectorized> +inline convert_float_to_int8(at::vec::Vectorized src) { // Convert from float32 to int32 with truncation __m512i x_values_int32 = _mm512_cvttps_epi32(src); // Convert from int32 to int16 using signed saturation __m512i xy_packed_v = _mm512_packs_epi32(x_values_int32, x_values_int32); - constexpr auto min_val = std::numeric_limits::min(); - constexpr auto max_val = std::numeric_limits::max(); + constexpr auto min_val = std::numeric_limits::min(); + constexpr auto max_val = std::numeric_limits::max(); - // Convert from int16 to uint8 using unsigned saturation - __m512i xyzw_clamped_v = pack_saturate_and_clamp( + // Convert from int16 to uint8/int8 using unsigned saturation + __m512i xyzw_clamped_v = pack_saturate_and_clamp( xy_packed_v, xy_packed_v, min_val, max_val); __m512i permute_mask_v = _mm512_set_epi32(0x0f, 0x0b, 0x07, 0x03, 0x0e, 0x0a, 0x06, 0x02, @@ -128,7 +142,7 @@ inline Vectorized convert_float_to_uint8(at::vec::Vectorized src } template -inline void __attribute__((always_inline)) QuantizeAvx512( +__FORCE_INLINE void 
QuantizeAvx512( const float* src, T* dst, int len, @@ -406,7 +420,7 @@ __m512i RequantizeAvx512( __m512 multiplier, __m512i zp) { static_assert( - std::is_same::value || std::is_same::value, + std::is_same_v || std::is_same_v, "Only int8_t/uint8_t are supported"); constexpr auto min_val = std::numeric_limits::min(); constexpr auto max_val = std::numeric_limits::max(); @@ -517,10 +531,17 @@ struct Vectorized : public Vectorizedqi { Vectorized scale, Vectorized zero_point, Vectorized scale_neg_zp_premul) const { + #if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); + #else __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); + #endif __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); @@ -541,10 +562,17 @@ struct Vectorized : public Vectorizedqi { float_vec_return_type dequantize( Vectorized scale, Vectorized zero_point) const { + #if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); + #else __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); + #endif __m512 float_val0 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val0)); __m512 float_val1 = _mm512_cvtepi32_ps(cvtepi8_epi32(int_val1)); @@ -590,20 +618,34 @@ struct Vectorized : public Vectorizedqi { } int_vec_return_type widening_subtract(Vectorized b) const { + #if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); + #else __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); + #endif __m512i int32_val0 = cvtepi8_epi32(int_val0); __m512i int32_val1 = cvtepi8_epi32(int_val1); __m512i int32_val2 = cvtepi8_epi32(int_val2); __m512i int32_val3 = cvtepi8_epi32(int_val3); + #if defined(_MSC_VER) && !defined(__clang__) + __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); + #else __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); + #endif 
__m512i int32_b0 = cvtepi8_epi32(int_b0); __m512i int32_b1 = cvtepi8_epi32(int_b1); @@ -713,10 +755,17 @@ struct Vectorized : public Vectorizedqi { Vectorized scale, Vectorized zero_point, Vectorized scale_zp_premul) const { + #if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); + #else __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); + #endif __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); @@ -738,10 +787,17 @@ struct Vectorized : public Vectorizedqi { float_vec_return_type dequantize( Vectorized scale, Vectorized zero_point) const { + #if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); + #else __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); + #endif __m512 float_val0 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val0)); __m512 float_val1 = _mm512_cvtepi32_ps(cvtepu8_epi32(int_val1)); @@ -788,20 +844,34 @@ struct Vectorized : public Vectorizedqi { } int_vec_return_type widening_subtract(Vectorized b) const { + #if defined(_MSC_VER) && !defined(__clang__) + __m128i int_val0 = _mm_set_epi64x(vals.m512i_u64[1], vals.m512i_u64[0]); + __m128i int_val1 = _mm_set_epi64x(vals.m512i_u64[3], vals.m512i_u64[2]); + __m128i int_val2 = _mm_set_epi64x(vals.m512i_u64[5], vals.m512i_u64[4]); + __m128i int_val3 = _mm_set_epi64x(vals.m512i_u64[7], vals.m512i_u64[6]); + #else __m128i int_val0 = _mm_set_epi64x(vals[1], vals[0]); __m128i int_val1 = _mm_set_epi64x(vals[3], vals[2]); __m128i int_val2 = _mm_set_epi64x(vals[5], vals[4]); __m128i int_val3 = _mm_set_epi64x(vals[7], vals[6]); + #endif __m512i int32_val0 = cvtepu8_epi32(int_val0); __m512i int32_val1 = cvtepu8_epi32(int_val1); __m512i int32_val2 = cvtepu8_epi32(int_val2); __m512i int32_val3 = cvtepu8_epi32(int_val3); + #if defined(_MSC_VER) && !defined(__clang__) + __m128i int_b0 = _mm_set_epi64x(b.vals.m512i_u64[1], b.vals.m512i_u64[0]); + __m128i int_b1 = _mm_set_epi64x(b.vals.m512i_u64[3], b.vals.m512i_u64[2]); + __m128i int_b2 = _mm_set_epi64x(b.vals.m512i_u64[5], b.vals.m512i_u64[4]); + __m128i int_b3 = _mm_set_epi64x(b.vals.m512i_u64[7], b.vals.m512i_u64[6]); + #else __m128i int_b0 = _mm_set_epi64x(b.vals[1], b.vals[0]); __m128i int_b1 = _mm_set_epi64x(b.vals[3], b.vals[2]); __m128i int_b2 = _mm_set_epi64x(b.vals[5], b.vals[4]); __m128i int_b3 = _mm_set_epi64x(b.vals[7], b.vals[6]); + #endif __m512i int32_b0 = cvtepu8_epi32(int_b0); __m512i int32_b1 = cvtepu8_epi32(int_b1); diff --git a/aten/src/ATen/cpu/vec/vec_base.h b/aten/src/ATen/cpu/vec/vec_base.h index e5955a802d016..d696c97b59497 100644 --- a/aten/src/ATen/cpu/vec/vec_base.h +++ b/aten/src/ATen/cpu/vec/vec_base.h @@ 
-26,18 +26,22 @@ #include #include #include -#include #include #include #include #include -#include #include #include #include #include #include +#if defined(__GNUC__) +#define __FORCE_INLINE __attribute__((always_inline)) inline +#elif defined(_MSC_VER) +#define __FORCE_INLINE __forceinline +#endif + // These macros helped us unify vec_base.h #ifdef CPU_CAPABILITY_AVX512 #if defined(__GNUC__) @@ -68,9 +72,9 @@ inline namespace CPU_CAPABILITY { template struct is_floating_point: std::integral_constant::value || - std::is_same::value || - std::is_same::value> { + std::is_floating_point_v || + std::is_same_v || + std::is_same_v> { }; template @@ -79,13 +83,23 @@ constexpr bool is_floating_point_v = is_floating_point::value; template struct is_reduced_floating_point: std::integral_constant::value || - std::is_same::value> { + std::is_same_v || + std::is_same_v> { }; template constexpr bool is_reduced_floating_point_v = is_reduced_floating_point::value; +template +struct is_8bit_integer: + std::integral_constant || + std::is_same_v> { +}; + +template +constexpr bool is_8bit_integer_v = is_8bit_integer::value; + template struct int_of_size; #define DEFINE_INT_OF_SIZE(int_t) \ @@ -147,9 +161,8 @@ struct Vectorized { // versions GCC/Clang have buggy determinations on whether or not an // identifier is odr-used or not, and in any case it's hard to tell if // a variable is odr-used or not. So best to just cut the problem at the root. - static constexpr size_type size_T = sizeof(T); // Workaround to compile with VS2022. static constexpr size_type size() { - return VECTOR_WIDTH / size_T; + return VECTOR_WIDTH / sizeof(T); } Vectorized() : values{static_cast(0)} {} Vectorized(T val) { @@ -231,6 +244,11 @@ struct Vectorized { std::memcpy(vector.values, ptr, count * sizeof(T)); return vector; } + static Vectorized loadu_one_fourth(const void* ptr) { + static_assert(std::is_same_v || std::is_same_v, "For byte types only"); + return Vectorized::loadu(ptr, 8); + } + void store(void* ptr, int count = size()) const { std::memcpy(ptr, values, count * sizeof(T)); } @@ -255,6 +273,14 @@ struct Vectorized { } return vector; } + bool has_inf_nan() const { + for (int64_t i = 0; i != size(); i++) { + if(_isnan(values[i]) || _isinf(values[i])) { + return true; + } + } + return false; + } Vectorized map(T (*const f)(T)) const { Vectorized ret; for (int64_t i = 0; i != size(); i++) { @@ -270,95 +296,98 @@ struct Vectorized { return ret; } template && !c10::is_complex::value, int>::type = 0> + typename std::enable_if_t && !c10::is_complex::value, int> = 0> Vectorized abs() const { // other_t_abs is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "other_t_abs must be T"); + static_assert(std::is_same_v, "other_t_abs must be T"); return map([](T x) -> T { return x < static_cast(0) ? -x : x; }); } template , int>::type = 0> + typename std::enable_if_t, int> = 0> Vectorized abs() const { // float_t_abs is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "float_t_abs must be T"); + static_assert(std::is_same_v, "float_t_abs must be T"); // Specifically deal with floating-point because the generic code above won't handle -0.0 (which should result in // 0.0) properly. return map([](T x) -> T { return std::abs(x); }); } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized abs() const { // complex_t_abs is for SFINAE and clarity. Make sure it is not changed. 
- static_assert(std::is_same::value, "complex_t_abs must be T"); + static_assert(std::is_same_v, "complex_t_abs must be T"); // Specifically map() does not perform the type conversion needed by abs. return map([](T x) { return static_cast(std::abs(x)); }); } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized sgn() const { return map(at::native::sgn_impl); } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized angle() const { // other_t_angle is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "other_t_angle must be T"); + static_assert(std::is_same_v, "other_t_angle must be T"); return map(at::native::angle_impl); // compiler is unable to resolve the overload without } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized angle() const { // complex_t_angle is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "complex_t_angle must be T"); + static_assert(std::is_same_v, "complex_t_angle must be T"); return map([](T x) { return static_cast(std::arg(x)); }); } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized real() const { // other_t_real is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "other_t_real must be T"); + static_assert(std::is_same_v, "other_t_real must be T"); return *this; } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized real() const { // complex_t_real is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "complex_t_real must be T"); + static_assert(std::is_same_v, "complex_t_real must be T"); return map([](T x) { return static_cast(x.real()); }); } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized imag() const { // other_t_imag is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "other_t_imag must be T"); + static_assert(std::is_same_v, "other_t_imag must be T"); return Vectorized(0); } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized imag() const { // complex_t_imag is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "complex_t_imag must be T"); + static_assert(std::is_same_v, "complex_t_imag must be T"); return map([](T x) { return static_cast(x.imag()); }); } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized conj() const { // other_t_conj is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "other_t_conj must be T"); + static_assert(std::is_same_v, "other_t_conj must be T"); return *this; } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized conj() const { // complex_t_conj is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "complex_t_conj must be T"); + static_assert(std::is_same_v, "complex_t_conj must be T"); return map([](T x) { return static_cast(std::conj(x)); }); } Vectorized acos() const { return map(std::acos); } + Vectorized acosh() const { + return map(std::acosh); + } Vectorized asin() const { return map(std::asin); } @@ -414,7 +443,7 @@ struct Vectorized { typename std::enable_if_t, int> = 0> Vectorized fmod(const Vectorized& q) const { // U is for SFINAE purposes only. 
Make sure it is not changed. - static_assert(std::is_same::value, "U must be T"); + static_assert(std::is_same_v, "U must be T"); Vectorized ret; for (const auto i : c10::irange(size())) { ret[i] = std::fmod(values[i], q[i]); @@ -431,17 +460,17 @@ struct Vectorized { return map(std::log1p); } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized log2() const { // other_t_log2 is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "other_t_log2 must be T"); + static_assert(std::is_same_v, "other_t_log2 must be T"); return map(std::log2); } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized log2() const { // complex_t_log2 is for SFINAE and clarity. Make sure it is not changed. - static_assert(std::is_same::value, "complex_t_log2 must be T"); + static_assert(std::is_same_v, "complex_t_log2 must be T"); const T log_2 = T(std::log(2.0)); return Vectorized(map(std::log))/Vectorized(log_2); } @@ -613,6 +642,12 @@ template Vectorized inline operator/(const Vectorized &a, const return c; } +template , int> = 0> +Vectorized inline operator%(const Vectorized &a, const Vectorized &b) __ubsan_ignore_float_divide_by_zero__ { + return a - a / b * b; +} + template Vectorized inline operator||( const Vectorized &a, const Vectorized &b) { Vectorized c; @@ -625,7 +660,7 @@ template Vectorized inline operator||( // Implements the IEEE 754 201X `maximum` operation, which propagates NaN if // either input is a NaN. template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized inline maximum(const Vectorized &a, const Vectorized &b) { Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { @@ -641,7 +676,7 @@ Vectorized inline maximum(const Vectorized &a, const Vectorized &b) { } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized inline maximum(const Vectorized &a, const Vectorized &b) { Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { @@ -659,7 +694,7 @@ Vectorized inline maximum(const Vectorized &a, const Vectorized &b) { // Implements the IEEE 754 201X `minimum` operation, which propagates NaN if // either input is a NaN. 
template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized inline minimum(const Vectorized &a, const Vectorized &b) { Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { @@ -675,7 +710,7 @@ Vectorized inline minimum(const Vectorized &a, const Vectorized &b) { } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized inline minimum(const Vectorized &a, const Vectorized &b) { Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { @@ -691,7 +726,7 @@ Vectorized inline minimum(const Vectorized &a, const Vectorized &b) { } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized inline clamp(const Vectorized &a, const Vectorized &min_vec, const Vectorized &max_vec) { Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { @@ -701,7 +736,7 @@ Vectorized inline clamp(const Vectorized &a, const Vectorized &min_vec, } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized inline clamp_max(const Vectorized &a, const Vectorized &max_vec) { Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { @@ -711,7 +746,7 @@ Vectorized inline clamp_max(const Vectorized &a, const Vectorized &max_ } template ::value, int>::type = 0> + typename std::enable_if_t::value, int> = 0> Vectorized inline clamp_min(const Vectorized &a, const Vectorized &min_vec) { Vectorized c; for (int i = 0; i != Vectorized::size(); i++) { @@ -821,8 +856,8 @@ inline Vectorized operator^(const Vectorized& a, const Vectorized& b) { template>::value, int> = 0> inline Vectorized operator~(const Vectorized& a) { - Vectorized ones; // All bits are 1 - memset((T*) ones, 0xFF, VECTOR_WIDTH); + using int_t = int_same_size_t; + Vectorized ones(c10::bit_cast((int_t)(~(int_t)0))); // All bits are 1 return a ^ ones; } @@ -980,6 +1015,19 @@ inline Vectorized convert_to_int_of_same_size(const Vectorized& src) return Vectorized::loadu(static_cast(buffer.data())); } +template > +inline Vectorized convert_to_fp_of_same_size(const Vectorized& src) { + static_assert(sizeof(T) == sizeof(IntType)); + static constexpr int size = Vectorized::size(); + + std::array src_arr; + src.store(static_cast(src_arr.data())); + std::array buffer; + std::transform(src_arr.cbegin(), src_arr.cend(), buffer.begin(), + [](const IntType& x) { return static_cast(x); }); + return Vectorized::loadu(static_cast(buffer.data())); +} + // Example inputs for AVX512: // a Vectorized = {a0, b0, a1, b1, a2, b2, a3, b3, a4, b4, a5, b5, a6, b6, a7, b7} // b Vectorized = {a8, b8, a9, b9, a10, b10, a11, b11, a12, b12, a13, b13, a14, b14, a15, b15} @@ -1079,3 +1127,8 @@ inline void transpose_mxn(const T* src, int64_t ld_src, T* dst, int64_t ld_dst) } }} // namespace at::vec::CPU_CAPABILITY + +// additional headers for more operations that depend on vec_base +#include +#include +#include diff --git a/aten/src/ATen/cpu/vec/vec_convert.h b/aten/src/ATen/cpu/vec/vec_convert.h new file mode 100644 index 0000000000000..56488928156af --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec_convert.h @@ -0,0 +1,56 @@ +#pragma once + +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +template < + typename dst_t, + int dst_n, + typename src_t, + int src_n, + typename Enabled = void> +struct VecConvert { + static inline VectorizedN apply( + const VectorizedN& src) { + constexpr int count = std::min( + VectorizedN::size(), VectorizedN::size()); + __at_align__ src_t src_buf[VectorizedN::size()]; + src.store(src_buf); + 
__at_align__ dst_t dst_buf[VectorizedN::size()]; + for (int i = 0; i < count; i++) { + dst_buf[i] = static_cast(src_buf[i]); + } + return VectorizedN::loadu(dst_buf, count); + } +}; + +template +inline Vectorized convert(const Vectorized& src) { + return VecConvert::apply(src); +} + +template < + typename dst_t, + int dst_n, + typename src_t, + int src_n, + std::enable_if_t = 0> +inline VectorizedN convert(const VectorizedN& src) { + return VecConvert::apply(src); +} + +template < + typename dst_t, + int dst_n, + typename src_t, + int src_n, + std::enable_if_t = 0> +inline Vectorized convert(const VectorizedN& src) { + return VecConvert::apply(src); +} + +} // namespace CPU_CAPABILITY +} // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec_mask.h b/aten/src/ATen/cpu/vec/vec_mask.h new file mode 100644 index 0000000000000..90f0f98962d90 --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec_mask.h @@ -0,0 +1,266 @@ +#pragma once + +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +/** + * The `VecMask` class provides a convenient interface for working with + * vectorized masks in SIMD operations. It encapsulates a `Vectorized` + * mask that can be directly usable in masked vectorized operations. It provides + * various methods for manipulating and accessing the mask elements: + * 1. `from` and `to`: Conversion between a vector of boolean values and a + * vectorized mask. + * 2. `cast`: Casts the mask to a different base type. + * 3. `all_zero`: Checks if all mask elements are zero. + * 4. `is_masked`: Checks if a specific element is masked. + * 5. `loadu`: Loads data from memory using the mask. + * 6. `all_masked`: Checks if all mask elements are masked. + * + * Some helper template classes are provided to simplify the specialization of + * the `VecMask` for the specific CPU arch: + * 1. `VecMaskLoad`: Loads data from memory using the mask. + * 2. `VecMaskTo`: Converts the mask to boolean. + * 3. `VecMaskCast`: Casts the mask to a different base type. + * + */ +template +class VecMask; + +template < + typename data_t, + int data_n, + typename mask_t, + int mask_n, + typename Enabled = void> +struct VecMaskLoad { + static inline VectorizedN apply( + const data_t* ptr, + const VecMask& vec_mask) { + constexpr typename VecMask::size_type size = + VecMask::size(); + static_assert(VectorizedN::size() >= size); + __at_align__ data_t data[size]; + __at_align__ mask_t mask[size]; + auto mask_ = VectorizedN(vec_mask); + mask_.store(mask); + for (int i = 0; i < size; i++) { + data[i] = mask[i] ? 
ptr[i] : static_cast(0); + } + return VectorizedN::loadu(data, size); + } +}; + +template < + typename dst_t, + int dst_n, + typename src_t, + int src_n, + typename Enabled = void> +struct VecMaskTo { + static inline VecMask apply( + const VecMask& vec_mask) { + auto zeros = VectorizedN(static_cast(0)); + auto ones = VectorizedN(static_cast(1)); + return VectorizedN::blendv( + zeros, ones, vec_mask.template cast()); + } +}; + +template +struct VecMaskCast { + static inline VecMask apply( + const VecMask& vec_mask) { + return VecMask::from(VectorizedN(vec_mask)); + } +}; + +template +struct VecMaskCast { + static inline VecMask apply(const VecMask& vec_mask) { + return vec_mask; + } +}; + +template +class VecMask { + public: + using size_type = int; + static constexpr size_type size() { + return VectorizedN::size(); + } + + private: + VectorizedN mask_; + + public: + VecMask() : mask_(static_cast(0)) {} + VecMask(const VectorizedN& mask) : mask_(mask) {} + + template = 0> + VecMask(const Vectorized& mask) : mask_(mask) {} + + template + static VecMask from(const VectorizedN& b_vec) { + __at_align__ U b_buf[size()]; + if constexpr (size() >= VectorizedN::size()) { + b_vec.store(b_buf); + for (int i = VectorizedN::size(); i < size(); i++) { + b_buf[i] = static_cast(0); + } + } else { + b_vec.store(b_buf, size()); + } + return from(b_buf); + } + + template + static VecMask from(U b) { + using int_t = int_same_size_t; + T mask = b ? c10::bit_cast((int_t)(~(int_t)0)) : (T)0; + return VectorizedN(mask); + } + + template + static VecMask from(U* b) { + using int_t = int_same_size_t; + __at_align__ T mask[size()]; +#pragma unroll + for (int i = 0; i < size(); i++) { + *(int_t*)(mask + i) = b[i] ? ~(int_t)0 : (int_t)0; + } + return VectorizedN(VectorizedN::loadu(mask)); + } + + static VecMask blendv( + const VecMask& c, + const VecMask& b, + const VecMask& a) { + VectorizedN result = VectorizedN::blendv( + VectorizedN(c), + VectorizedN(b), + VectorizedN(a)); + return result; + } + + void store(bool* b, int count = size()) { + constexpr int L = (VectorizedN::size() + Vectorized::size() - 1)/ Vectorized::size(); + auto res = this->to(); + res.store(b, count); + return; + } + + template = 2, int> = 0> + inline VectorizedN to() const { + return VecMaskTo::apply(*this); + } + + template = 0> + inline Vectorized to() const { + return VecMaskTo::apply(*this); + } + + template + inline VecMask cast() const { + return VecMaskCast::apply(*this); + } + + inline bool all_zero() const { + __at_align__ T mask[size()]; + mask_.store(mask); + return std::all_of( + mask, mask + size(), [](T m) { return m == static_cast(0); }); + } + + inline bool all_masked() const { + __at_align__ T mask[size()]; + mask_.store(mask); + return std::all_of( + mask, mask + size(), [](T m) { return m != static_cast(0); }); + } + + inline bool is_masked(int i) const { + __at_align__ T mask[size()]; + mask_.store(mask); + return mask[i] != static_cast(0); + } + + inline operator VectorizedN() const { + return mask_; + } + + template = 0> + inline operator Vectorized() const { + return mask_[0]; + } + + inline Vectorized operator[](int i) const { + return mask_[i]; + } + + template < + typename U, + int L, + std::enable_if_t= 2 && VectorizedN::size() >= size(), int> = 0> + VectorizedN loadu(const U* ptr) const { + return VecMaskLoad::apply(ptr, *this); + } + + template < + typename U, + int L, + std::enable_if_t::size() >= size(), int> = 0> + Vectorized loadu(const U* ptr) const { + return VecMaskLoad::apply(ptr, *this); + } +}; + 
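// Illustrative usage, not part of the patch: build a VecMask from a bool
// array and perform a masked load. This assumes an AVX-512 build where
// Vectorized<float> holds 16 lanes; on other ISAs the generic VecMaskLoad
// fallback above runs instead. The names masked_load/keep are made up for
// the example.
inline Vectorized<float> masked_load(const float* data, const bool* keep) {
  auto mask = VecMask<float, 1>::from(keep);  // all-ones lanes where keep[i] is true
  if (mask.all_zero()) {
    return Vectorized<float>(0.f);
  }
  return mask.loadu<float, 1>(data);          // masked-off lanes are zero-filled
}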
+#define VEC_MASK_DEFINE_UNARY_OP_GLOBAL(op) \ + template \ + inline VecMask op(const VecMask& a) { \ + return op(VectorizedN(a)); \ + } + +#define VEC_MASK_DEFINE_BINARY_OP_GLOBAL(op) \ + template < \ + typename T, \ + int N, \ + typename V, \ + int M, \ + std::enable_if_t::size() == VecMask::size(), int> = \ + 0> \ + inline VecMask op(const VecMask& a, const VecMask& b) { \ + return op( \ + VectorizedN(a), VectorizedN(b.template cast())); \ + } + +#define VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(op, EXPR) \ + template < \ + typename T, \ + int N, \ + typename V, \ + int M, \ + std::enable_if_t::size() == VecMask::size(), int> = \ + 0> \ + inline VecMask op(const VecMask& a, const VecMask& b) { \ + return EXPR; \ + } + +VEC_MASK_DEFINE_UNARY_OP_GLOBAL(operator~) +VEC_MASK_DEFINE_BINARY_OP_GLOBAL(operator&) +VEC_MASK_DEFINE_BINARY_OP_GLOBAL(operator|) +VEC_MASK_DEFINE_BINARY_OP_GLOBAL(operator^) +VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator>, a & ~b) +VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator<, ~a& b) +VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator==, ~(a ^ b)) +VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator>=, (a == b) | (a > b)) +VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL(operator<=, (a == b) | (a < b)) + +#undef VEC_MASK_DEFINE_UNARY_OP_GLOBAL +#undef VEC_MASK_DEFINE_BINARY_OP_GLOBAL +#undef VEC_MASK_DEFINE_BINARY_OP_WITH_EXPR_GLOBAL + +} // namespace CPU_CAPABILITY +} // namespace at::vec diff --git a/aten/src/ATen/cpu/vec/vec_n.h b/aten/src/ATen/cpu/vec/vec_n.h new file mode 100644 index 0000000000000..5b0eb352d6627 --- /dev/null +++ b/aten/src/ATen/cpu/vec/vec_n.h @@ -0,0 +1,356 @@ +#pragma once + +#include +#include + +namespace at::vec { +inline namespace CPU_CAPABILITY { + +/** + * @brief A class template representing a vectorized type with + * `N * Vectorized::size()` elements, aiming to support vectors of + * arbitrary size. A specific use case of it is to represent vectors + * converted from data types with different sizes but with the same + * number of vector elements, e.g., `VectorizedN` can be + * a vector converted from two `Vectorized`, `VectorizedN` + * can be a vector converted from two `Vectorized` etc. + * + * It supports most of the operations of `Vectorized` + * and the implementation delegates to `Vectorized` with loops over `N`. + * + * @tparam T The underlying type of the vectorized elements. + * @tparam N The number of underlying `Vectorized`. 
+ */ +template +class VectorizedN { + public: + using value_type = T; + using size_type = int; + + static constexpr size_type size_T = sizeof(T); + static constexpr size_type size() { + return Vectorized::size() * N; + } + + private: + std::array, N> values; + + public: + // methods not implemented yet: + // variadic constructor, operator T*, as_bytes, zero_mask + +#define VECTORIZEDN_DEFINE_UNARY_OP(op) \ + VectorizedN op() const { \ + return unary_op([](const Vectorized& a) { return a.op(); }); \ + } + +#define VECTORIZEDN_DEFINE_BINARY_OP(op) \ + VectorizedN op(const VectorizedN& other) const { \ + return binary_op( \ + other, [](const Vectorized& a, const Vectorized& b) { \ + return a.op(b); \ + }); \ + } + + template + inline VectorizedN unary_op(Op op) const { + VectorizedN result; +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < N; ++i) { + result.values[i] = op(values[i]); + } + return result; + } + + template + inline VectorizedN binary_op(const VectorizedN& other, Op op) + const { + VectorizedN result; +#ifndef _MSC_VER +#pragma unroll +#endif + for (int i = 0; i < N; ++i) { + result.values[i] = op(values[i], other.values[i]); + } + return result; + } + + VectorizedN() = default; + + explicit VectorizedN(T val) { + for (int i = 0; i < N; ++i) { + values[i] = Vectorized(val); + } + } + + template = 0> + VectorizedN(const Vectorized& val) : values({val}) {} + + template = 0> + inline operator Vectorized() const { + return values[0]; + } + + inline const Vectorized& operator[](int i) const { + return values[i]; + } + + inline Vectorized& operator[](int i) { + return values[i]; + } + + template + static VectorizedN blend( + const VectorizedN& a, + const VectorizedN& b) { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = Vectorized::template blend(a.values[i], b.values[i]); + } + return result; + } + + static VectorizedN blendv( + const VectorizedN& a, + const VectorizedN& b, + const VectorizedN& mask) { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = + Vectorized::blendv(a.values[i], b.values[i], mask.values[i]); + } + return result; + } + + template + static VectorizedN arange( + T base = static_cast(0), + step_t step = static_cast(1)) { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = Vectorized::arange(base, step); + base += step * Vectorized::size(); + } + return result; + } + + static VectorizedN set( + const VectorizedN& a, + const VectorizedN& b, + int64_t count = size()) { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = Vectorized::set( + a.values[i], + b.values[i], + std::min(count, (int64_t)Vectorized::size())); + count -= Vectorized::size(); + if (count <= 0) { + break; + } + } + return result; + } + + static VectorizedN loadu(const void* ptr) { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = Vectorized::loadu(ptr); + ptr = static_cast(ptr) + Vectorized::size(); + } + return result; + } + + static VectorizedN loadu(const void* ptr, int64_t count) { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = Vectorized::loadu( + ptr, std::min(count, (int64_t)Vectorized::size())); + ptr = static_cast(ptr) + Vectorized::size(); + count -= Vectorized::size(); + if (count <= 0) { + break; + } + } + return result; + } + + void store(void* ptr) const { + for (int i = 0; i < N; ++i) { + values[i].store(ptr); + ptr = static_cast(ptr) + Vectorized::size(); + } + } + + void store(void* ptr, int count) const { + for (int i = 
0; i < N; ++i) { + values[i].store(ptr, std::min(count, (int)Vectorized::size())); + ptr = static_cast(ptr) + Vectorized::size(); + count -= Vectorized::size(); + if (count <= 0) { + break; + } + } + } + + bool has_inf_nan() const { + for (int i = 0; i < N; ++i) { + if (values[i].has_inf_nan()) { + return true; + } + } + return false; + } + + VectorizedN map(T (*const f)(T)) const { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = values[i].map(f); + } + return result; + } + + VectorizedN map(T (*const f)(const T&)) const { + VectorizedN result; + for (int i = 0; i < N; ++i) { + result.values[i] = values[i].map(f); + } + return result; + } + + VECTORIZEDN_DEFINE_UNARY_OP(abs) + VECTORIZEDN_DEFINE_UNARY_OP(sgn) + VECTORIZEDN_DEFINE_UNARY_OP(angle) + VECTORIZEDN_DEFINE_UNARY_OP(real) + VECTORIZEDN_DEFINE_UNARY_OP(imag) + VECTORIZEDN_DEFINE_UNARY_OP(conj) + VECTORIZEDN_DEFINE_UNARY_OP(acos) + VECTORIZEDN_DEFINE_UNARY_OP(acosh) + VECTORIZEDN_DEFINE_UNARY_OP(asin) + VECTORIZEDN_DEFINE_UNARY_OP(atan) + VECTORIZEDN_DEFINE_UNARY_OP(atanh) + VECTORIZEDN_DEFINE_BINARY_OP(atan2) + VECTORIZEDN_DEFINE_BINARY_OP(copysign) + VECTORIZEDN_DEFINE_UNARY_OP(erf) + VECTORIZEDN_DEFINE_UNARY_OP(erfc) + VECTORIZEDN_DEFINE_UNARY_OP(erfinv) + VECTORIZEDN_DEFINE_UNARY_OP(exp) + VECTORIZEDN_DEFINE_UNARY_OP(exp2) + VECTORIZEDN_DEFINE_UNARY_OP(expm1) + VECTORIZEDN_DEFINE_UNARY_OP(exp_u20) + VECTORIZEDN_DEFINE_UNARY_OP(frac) + VECTORIZEDN_DEFINE_BINARY_OP(fmod) + VECTORIZEDN_DEFINE_UNARY_OP(log) + VECTORIZEDN_DEFINE_UNARY_OP(log10) + VECTORIZEDN_DEFINE_UNARY_OP(log1p) + VECTORIZEDN_DEFINE_UNARY_OP(log2) + VECTORIZEDN_DEFINE_UNARY_OP(ceil) + VECTORIZEDN_DEFINE_UNARY_OP(cos) + VECTORIZEDN_DEFINE_UNARY_OP(cosh) + VECTORIZEDN_DEFINE_UNARY_OP(floor) + VECTORIZEDN_DEFINE_BINARY_OP(hypot) + VECTORIZEDN_DEFINE_UNARY_OP(i0) + VECTORIZEDN_DEFINE_UNARY_OP(i0e) + VECTORIZEDN_DEFINE_UNARY_OP(digamma) + VECTORIZEDN_DEFINE_BINARY_OP(igamma) + VECTORIZEDN_DEFINE_BINARY_OP(igammac) + VECTORIZEDN_DEFINE_UNARY_OP(neg) + VECTORIZEDN_DEFINE_BINARY_OP(nextafter) + VECTORIZEDN_DEFINE_UNARY_OP(round) + VECTORIZEDN_DEFINE_UNARY_OP(sin) + VECTORIZEDN_DEFINE_UNARY_OP(sinh) + VECTORIZEDN_DEFINE_UNARY_OP(tan) + VECTORIZEDN_DEFINE_UNARY_OP(tanh) + VECTORIZEDN_DEFINE_UNARY_OP(trunc) + VECTORIZEDN_DEFINE_UNARY_OP(lgamma) + VECTORIZEDN_DEFINE_UNARY_OP(sqrt) + VECTORIZEDN_DEFINE_UNARY_OP(reciprocal) + VECTORIZEDN_DEFINE_UNARY_OP(rsqrt) + VECTORIZEDN_DEFINE_BINARY_OP(pow) + VECTORIZEDN_DEFINE_BINARY_OP(operator==) + VECTORIZEDN_DEFINE_BINARY_OP(operator!=) + VECTORIZEDN_DEFINE_BINARY_OP(operator>=) + VECTORIZEDN_DEFINE_BINARY_OP(operator<=) + VECTORIZEDN_DEFINE_BINARY_OP(operator>) + VECTORIZEDN_DEFINE_BINARY_OP(operator<) + VECTORIZEDN_DEFINE_BINARY_OP(eq) + VECTORIZEDN_DEFINE_BINARY_OP(ne) + VECTORIZEDN_DEFINE_BINARY_OP(gt) + VECTORIZEDN_DEFINE_BINARY_OP(ge) + VECTORIZEDN_DEFINE_BINARY_OP(lt) + VECTORIZEDN_DEFINE_BINARY_OP(le) + +#undef VECTORIZEDN_DEFINE_UNARY_OP +#undef VECTORIZEDN_DEFINE_BINARY_OP +}; + +#define VECTORIZEDN_DEFINE_UNARY_OP_GLOBAL(op) \ + template \ + inline VectorizedN op(const VectorizedN& a) { \ + return a.unary_op([](const Vectorized& a) { return op(a); }); \ + } + +#define VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(op) \ + template \ + inline VectorizedN op( \ + const VectorizedN& a, const VectorizedN& b) { \ + return a.binary_op(b, [](const Vectorized& a, const Vectorized& b) { \ + return op(a, b); \ + }); \ + } + +#define VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(op) \ + template \ + inline VectorizedN& 
op( \ + VectorizedN& a, const VectorizedN& b) { \ + a = a.binary_op(b, [](const Vectorized& a, const Vectorized& b) { \ + return op(a, b); \ + }); \ + return a; \ + } + +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator+) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator-) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator*) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator/) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator%) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator||) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator<<) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator>>) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(maximum) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(minimum) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(fmadd) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(fmsub) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(clamp) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(clamp_max) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(clamp_min) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator&) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator|) +VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL(operator^) +VECTORIZEDN_DEFINE_UNARY_OP_GLOBAL(operator~) + +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator+=) +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator-=) +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator*=) +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator/=) +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator%=) +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator<<=) +VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL(operator>>=) + +#undef VECTORIZEDN_DEFINE_UNARY_OP_GLOBAL +#undef VECTORIZEDN_DEFINE_BINARY_OP_GLOBAL +#undef VECTORIZEDN_DEFINE_BINARY_OP_INPLACE_GLOBAL + +template +inline T vec_reduce_all(const OpVec& vec_fun, VectorizedN acc_vec) { + Vectorized vec_result = acc_vec[0]; + for (int i = 1; i < N; i++) { + vec_result = vec_fun(vec_result, acc_vec[i]); + } + return vec_reduce_all(vec_fun, vec_result); +} + +} // namespace CPU_CAPABILITY +} // namespace at::vec diff --git a/aten/src/ATen/cpu/vml.h b/aten/src/ATen/cpu/vml.h index ba99583b02b23..fe57a27a04d9f 100644 --- a/aten/src/ATen/cpu/vml.h +++ b/aten/src/ATen/cpu/vml.h @@ -100,11 +100,11 @@ IMPLEMENT_VML(lgamma) #if AT_MKL_ENABLED() && !defined(__APPLE__) // NB: LP64 MKL is the most commonly used and thus we assume it here. That means -// we need to expect MKL_INT to be of type int, which implies int32_t in most +// we need to expect MKL_INT to be of type int, which implies int32_t or int64_t in most // cases. 
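[Illustrative aside, not part of the patch] Stepping back to the VectorizedN class completed above (the diff rendering drops its template arguments; from the member `std::array<Vectorized, N>` and `Vectorized::size() * N` it is evidently VectorizedN<T, N> wrapping N Vectorized<T> registers), here is a minimal usage sketch. The include path, the <float, 2> instantiation, and the choice of exp() are assumptions for illustration only.

    #include <cstdint>
    #include <ATen/cpu/vec/vec.h>   // assumed umbrella header for at::vec

    // Apply exp() in place over a float buffer, two Vectorized<float> registers at a time.
    void exp_inplace(float* data, int64_t n) {
      using VecN = at::vec::VectorizedN<float, 2>;
      const int64_t step = VecN::size();            // Vectorized<float>::size() * 2
      int64_t i = 0;
      for (; i + step <= n; i += step) {
        VecN v = VecN::loadu(data + i);             // fills all N inner vectors
        v.exp().store(data + i);                    // unary ops fan out across the N lanes
      }
      if (i < n) {
        VecN v = VecN::loadu(data + i, n - i);      // masked tail load
        v.exp().store(data + i, static_cast<int>(n - i));
      }
    }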
static_assert( - std::is_same::value, - "MKL_INT is assumed to be int32_t"); + std::is_same_v || std::is_same_v, + "MKL_INT is assumed to be int32_t or int64_t"); #define IMPLEMENT_VML_MKL_STUB(op, mklop, type, mkltype) \ template <> \ inline void v##op(type * out, const type * in, int64_t size) { \ diff --git a/aten/src/ATen/cuda/ApplyGridUtils.cuh b/aten/src/ATen/cuda/ApplyGridUtils.cuh index 18ce3ba34e87c..b0b1412298d7b 100644 --- a/aten/src/ATen/cuda/ApplyGridUtils.cuh +++ b/aten/src/ATen/cuda/ApplyGridUtils.cuh @@ -20,7 +20,7 @@ constexpr uint32_t AT_APPLY_THREADS_PER_BLOCK = 512; constexpr uint32_t AT_APPLY_BLOCKS_PER_SM = 4; template -inline bool getApplyGrid(uint64_t totalElements, dim3& grid, int64_t curDevice, int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) { +inline bool getApplyGrid(uint64_t totalElements, dim3& grid, c10::DeviceIndex curDevice, int max_threads_per_block=AT_APPLY_THREADS_PER_BLOCK) { if (curDevice == -1) return false; uint64_t numel_per_thread = static_cast(max_threads_per_block) * static_cast(step); uint64_t numBlocks = ATenCeilDiv(totalElements, numel_per_thread); diff --git a/aten/src/ATen/cuda/CUDAApplyUtils.cuh b/aten/src/ATen/cuda/CUDAApplyUtils.cuh index cd857c00988ba..4dcdabf17b3b9 100644 --- a/aten/src/ATen/cuda/CUDAApplyUtils.cuh +++ b/aten/src/ATen/cuda/CUDAApplyUtils.cuh @@ -403,7 +403,7 @@ inline bool CUDA_tensor_apply2(at::TensorBase a, const dim3 block = getApplyBlock(max_threads_per_block); dim3 grid; - int64_t curDevice = current_device(); + auto curDevice = current_device(); if (curDevice == -1) return false; if (!getApplyGrid(totalElements, grid, curDevice, max_threads_per_block)) { return false; diff --git a/aten/src/ATen/cuda/CUDABlas.cpp b/aten/src/ATen/cuda/CUDABlas.cpp index 9ae84c418d255..bfe6a02741ede 100644 --- a/aten/src/ATen/cuda/CUDABlas.cpp +++ b/aten/src/ATen/cuda/CUDABlas.cpp @@ -6,12 +6,17 @@ #include #include #include +#include +#include #include #include #include #include #ifdef USE_ROCM +#if ROCM_VERSION >= 60000 +#include +#endif // until hipblas has an API to accept flags, we must use rocblas here #include #include @@ -190,10 +195,10 @@ static size_t _parseChosenWorkspaceSize() { workspace_size = std::stoi(val); } catch(std::invalid_argument const& e) { TORCH_WARN("invalid CUBLASLT_WORKSPACE_SIZE,", - " using default workspace size of ", workspace_size, " bytes."); + " using default workspace size of ", workspace_size, " KiB."); } catch(std::out_of_range const& e) { TORCH_WARN("CUBLASLT_WORKSPACE_SIZE out of range,", - " using default workspace size of ", workspace_size, " bytes."); + " using default workspace size of ", workspace_size, " KiB."); } } return workspace_size * 1024; @@ -231,8 +236,289 @@ namespace at::cuda::blas { CUDABLAS_NONNEGINT_CHECK(bgemm, num_batches); \ } while (0) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) + +#if defined(USE_ROCM) && ROCM_VERSION >= 50700 && ROCM_VERSION < 60000 +// only for rocm 5.7 where we first supported hipblaslt, it was difficult +// to hipify correctly without this change. 
+#define hipDataType hipblasDatatype_t +#endif + +// hipblaslt custom types were a temporary work-around +#if defined(USE_ROCM) && ROCM_VERSION >= 60000 && defined(HIPBLASLT_CUSTOM_DATA_TYPE) +hipblasltDatatype_t hipToLt(hipDataType type) { + switch (type) { + case HIP_R_32F: return HIPBLASLT_R_32F; + case HIP_R_64F: return HIPBLASLT_R_64F; + case HIP_R_16F: return HIPBLASLT_R_16F; + case HIP_R_8I: return HIPBLASLT_R_8I; + case HIP_C_32F: return HIPBLASLT_C_32F; + case HIP_C_64F: return HIPBLASLT_C_64F; + case HIP_C_16F: return HIPBLASLT_C_16F; + case HIP_C_8I: return HIPBLASLT_C_8I; + case HIP_R_8U: return HIPBLASLT_R_8U; + case HIP_C_8U: return HIPBLASLT_C_8U; + case HIP_R_32I: return HIPBLASLT_R_32I; + case HIP_C_32I: return HIPBLASLT_C_32I; + case HIP_R_32U: return HIPBLASLT_R_32U; + case HIP_C_32U: return HIPBLASLT_C_32U; + case HIP_R_16BF: return HIPBLASLT_R_16B; + case HIP_C_16BF: return HIPBLASLT_C_16B; + default: TORCH_CHECK(false, "unknown hipDataType"); + } +} +#define HIPTOLT(type) hipToLt(type) +#else +#define HIPTOLT(type) type +#endif + +#if defined(USE_ROCM) && ROCM_VERSION >= 60000 && defined(HIPBLASLT_CUSTOM_COMPUTE_TYPE) +hipblasLtComputeType_t hipblasToLt(hipblasComputeType_t type) { + switch (type) { + case HIPBLAS_COMPUTE_32F: return HIPBLASLT_COMPUTE_F32; + case HIPBLAS_COMPUTE_32F_FAST_16F: return HIPBLASLT_COMPUTE_F32_FAST_F16; + case HIPBLAS_COMPUTE_32F_FAST_TF32: return HIPBLASLT_COMPUTE_F32_FAST_XF32; + case HIPBLAS_COMPUTE_64F: return HIPBLASLT_COMPUTE_F64; + case HIPBLAS_COMPUTE_32I: return HIPBLASLT_COMPUTE_I32; + default: TORCH_CHECK(false, "unknown hipblasComputeType_t"); + } +} +#define HIPCOMPTOLT(type) hipblasToLt(type) +#else +#define HIPCOMPTOLT(type) type +#endif + +namespace { +// Following the pattern of CuSparseDescriptor +// Defined here for now because this is the only place cublas_lt interface is +// used but can be moved to a header once cublas_lt interface is used in +// multiple places. +template +struct CuBlasLtDeleter { + void operator()(T* x) { + if (x != nullptr) { + TORCH_CUDABLAS_CHECK(destructor(x)); + } + } +}; + +template +class CuBlasLtDescriptor { + public: + T* descriptor() const { + return descriptor_.get(); + } + T* descriptor() { + return descriptor_.get(); + } + + protected: + std::unique_ptr> descriptor_; +}; + +class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor< + cublasLtMatmulDescOpaque_t, + &cublasLtMatmulDescDestroy> { + public: + CuBlasLtMatmulDescriptor( + cublasComputeType_t compute_type, + cudaDataType_t scale_type) { + cublasLtMatmulDesc_t raw_descriptor = nullptr; + TORCH_CUDABLAS_CHECK( + cublasLtMatmulDescCreate(&raw_descriptor, HIPCOMPTOLT(compute_type), HIPTOLT(scale_type))); + descriptor_.reset(raw_descriptor); + } + template + inline void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) { + TORCH_CUDABLAS_CHECK(::cublasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(T))); + } +}; + +class CuBlasLtMatrixLayout : public CuBlasLtDescriptor< + cublasLtMatrixLayoutOpaque_t, + &cublasLtMatrixLayoutDestroy> { + public: + CuBlasLtMatrixLayout( + cudaDataType_t type, + uint64_t rows, + uint64_t cols, + int64_t ld, + bool t = false) { + cublasLtMatrixLayout_t raw_descriptor = nullptr; + TORCH_CUDABLAS_CHECK( + cublasLtMatrixLayoutCreate(&raw_descriptor, HIPTOLT(type), t ? cols : rows, t ? 
rows : cols, ld)); + descriptor_.reset(raw_descriptor); + } + template + inline void setAttribute(cublasLtMatrixLayoutAttribute_t attr, const T value) { + TORCH_CUDABLAS_CHECK(::cublasLtMatrixLayoutSetAttribute(descriptor(), attr, &value, sizeof(T))); + } +}; + +class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< + cublasLtMatmulPreferenceOpaque_t, + &cublasLtMatmulPreferenceDestroy> { + public: + CuBlasLtMatmulPreference() { + cublasLtMatmulPreference_t raw_descriptor = nullptr; + TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceCreate(&raw_descriptor)); + descriptor_.reset(raw_descriptor); + } + template + inline void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) { + TORCH_CUDABLAS_CHECK(::cublasLtMatmulPreferenceSetAttribute(descriptor(), attr, &value, sizeof(T))); + } +}; +} // namespace + +#endif + +template +inline void bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 60000) + cudaDataType_t abcType = CUDA_R_32F; + cublasComputeType_t computeType = CUBLAS_COMPUTE_32F; + cudaDataType_t scaleType = CUDA_R_32F; + if constexpr (std::is_same_v) { + abcType = CUDA_R_64F; + computeType = CUBLAS_COMPUTE_64F; + scaleType = CUDA_R_64F; + } else if constexpr (std::is_same_v) { +#ifndef USE_ROCM + if (at::globalContext().allowTF32CuBLAS()) { + computeType = CUBLAS_COMPUTE_32F_FAST_TF32; + } +#endif + } else if constexpr (std::is_same_v>) { + abcType = CUDA_C_64F; + computeType = CUBLAS_COMPUTE_64F; + scaleType = CUDA_C_64F; + } else if constexpr (std::is_same_v>) { + abcType = CUDA_C_32F; + scaleType = CUDA_C_32F; + } else if constexpr (std::is_same_v) { + abcType = CUDA_R_16F; + } else if constexpr (std::is_same_v) { + abcType = CUDA_R_16BF; + } else { + AT_ERROR("at::cuda::blas::bgemm_internal_cublaslt: not implemented for ", typeid(Dtype).name()); + } + + globalContext().alertCuBLASConfigNotDeterministic(); + cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle(); + cublasOperation_t opa = _cublasOpFromChar(transa); + cublasOperation_t opb = _cublasOpFromChar(transb); + _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); + + CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, opa); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, opb); + CuBlasLtMatrixLayout Adesc(abcType, m, k, lda, opa == CUBLAS_OP_T); + CuBlasLtMatrixLayout Bdesc(abcType, k, n, ldb, opb == CUBLAS_OP_T); + CuBlasLtMatrixLayout Cdesc(abcType, m, n, ldc); + + if (num_batches > 1) { + int num_batches_as_int = static_cast(num_batches); + Adesc.setAttribute(CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, num_batches_as_int); + Bdesc.setAttribute(CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, num_batches_as_int); + Cdesc.setAttribute(CUBLASLT_MATRIX_LAYOUT_BATCH_COUNT, num_batches_as_int); + Adesc.setAttribute(CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, stridea); + Bdesc.setAttribute(CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, strideb); + Cdesc.setAttribute(CUBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, stridec); + } + + CuBlasLtMatmulPreference preference; + // See https://github.com/pytorch/pytorch/issues/73328 for reasoning behind + // setting this to 1M. 
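// [Illustrative aside, not part of the patch] _getWorkspaceSize() reflects the
// CUBLASLT_WORKSPACE_SIZE environment variable, which _parseChosenWorkspaceSize()
// (earlier in this file) reads in KiB and multiplies by 1024; e.g.
// CUBLASLT_WORKSPACE_SIZE=4096 requests a 4096 KiB (4 MiB) Lt workspace, while an
// unset or invalid value falls back to the default discussed in the issue above.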
+ size_t workspaceSize = _getWorkspaceSize(); + preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, workspaceSize); + +#ifndef USE_ROCM + uint32_t a_alignment = _getAlignment(reinterpret_cast(a)); + uint32_t b_alignment = _getAlignment(reinterpret_cast(b)); + uint32_t c_alignment = _getAlignment(reinterpret_cast(c)); + preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_A_BYTES, a_alignment); + preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_B_BYTES, b_alignment); + preference.setAttribute(CUBLASLT_MATMUL_PREF_MIN_ALIGNMENT_C_BYTES, c_alignment); +#endif + + auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); + auto workspace = allocator.allocate(workspaceSize); + TORCH_CHECK(workspace.get() != nullptr, "OOM trying to allocate workspace for cublaslt"); + + cublasLtMatmulHeuristicResult_t heuristicResult = {}; + int returnedResult = 0; + TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( + ltHandle, + computeDesc.descriptor(), + Adesc.descriptor(), + Bdesc.descriptor(), + Cdesc.descriptor(), + Cdesc.descriptor(), + preference.descriptor(), + 1, + &heuristicResult, + &returnedResult)); + if (returnedResult == 0) { + TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED); + } + + cublasStatus_t cublasStatus = cublasLtMatmul( + ltHandle, + computeDesc.descriptor(), + &alpha, + a, + Adesc.descriptor(), + b, + Bdesc.descriptor(), + &beta, + c, + Cdesc.descriptor(), + c, + Cdesc.descriptor(), + &heuristicResult.algo, + workspace.mutable_get(), + workspaceSize, + at::cuda::getCurrentCUDAStream()); + TORCH_CHECK( + cublasStatus == CUBLAS_STATUS_SUCCESS, + "CUDA error: ", + at::cuda::blas::_cublasGetErrorEnum(cublasStatus), + " when calling cublasLtMatmul with transpose_mat1 ", + (opa == CUBLAS_OP_T), + " transpose_mat2 ", + (opb == CUBLAS_OP_T), + " m ", + m, + " n ", + n, + " k ", + k, + " lda ", + lda, + " ldb ", + ldb, + " ldc ", + ldc, + " abcType ", + abcType, + " computeType ", + computeType, + " scaleType ", + scaleType); +#else + AT_ERROR("at::cuda::blas::bgemm_internal_cublaslt: not implemented for ", typeid(Dtype).name()); +#endif +} + + +template +inline void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { + AT_ERROR("at::cuda::blas::bgemm_internal_cublas: not implemented for ", typeid(Dtype).name()); +} + template <> -void bgemm(CUDABLAS_BGEMM_ARGTYPES(double)) { +void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(double)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -245,7 +531,7 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(double)) { } template <> -void bgemm(CUDABLAS_BGEMM_ARGTYPES(float)) { +void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(float)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -258,7 +544,7 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(float)) { } template <> -void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { +void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -273,7 +559,7 @@ void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) } template <> -void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { +void bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic 
Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -288,7 +574,7 @@ void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { } template <> -void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { +void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -335,7 +621,7 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { } template <> -void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { +void bgemm_internal_cublas(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); BGEMM_CHECK_ARGVALUES(at::BFloat16); @@ -346,23 +632,226 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { const float fbeta = beta; _cublasAdjustLdLevel3(transa, transb, m, n, k, &lda, &ldb, &ldc); -#if defined(USE_ROCM) && ROCM_VERSION >= 60000 - auto compute_type = CUBLAS_COMPUTE_32F; -#else - auto compute_type = CUDA_R_32F; -#endif - TORCH_CUDABLAS_CHECK(cublasGemmStridedBatchedEx(handle, - opa, opb, (int)m, (int)n, (int)k, - (void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea, - b, CUDA_R_16BF, (int)ldb, strideb, - (void*)&fbeta, c, CUDA_R_16BF, (int)ldc, stridec, - (int)num_batches, - compute_type, - CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +#if defined(USE_ROCM) && ROCM_VERSION >= 60000 + auto compute_type = CUBLAS_COMPUTE_32F; +#else + auto compute_type = CUDA_R_32F; +#endif + TORCH_CUDABLAS_CHECK(cublasGemmStridedBatchedEx(handle, + opa, opb, (int)m, (int)n, (int)k, + (void*)&falpha, a, CUDA_R_16BF, (int)lda, stridea, + b, CUDA_R_16BF, (int)ldb, strideb, + (void*)&fbeta, c, CUDA_R_16BF, (int)ldc, stridec, + (int)num_batches, + compute_type, + CUBLAS_GEMM_DEFAULT_TENSOR_OP)); +} + +template <> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +#ifdef USE_ROCM + // hipblaslt does not support double gemm yet + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(double)); +#else + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(double)); +#endif + } + else { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(double)); + } +} + +template <> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(float)); + } + else { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(float)); + } +} + +template <> +void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +#ifdef USE_ROCM + // hipblaslt does not support complex gemm yet + bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGS(c10::complex)); +#else + bgemm_internal_cublaslt>(CUDABLAS_BGEMM_ARGS(c10::complex)); +#endif + } + else { + bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGS(c10::complex)); + } +} + +template <> +void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +#ifdef USE_ROCM + // hipblaslt does not support complex gemm yet + bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGS(c10::complex)); +#else + bgemm_internal_cublaslt>(CUDABLAS_BGEMM_ARGS(c10::complex)); +#endif + } + else { + bgemm_internal_cublas>(CUDABLAS_BGEMM_ARGS(c10::complex)); + } +} + +template <> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)) +{ + if 
(at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::Half)); + } + else { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::Half)); + } +} + +template <> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + bgemm_internal_cublaslt(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } + else { + bgemm_internal_cublas(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +} + +template +inline void bgemm_tunable(CUDABLAS_BGEMM_ARGTYPES(DType)) { + tunable::GemmStridedBatchedParams params; + params.transa = transa; + params.transb = transb; + params.m = m; + params.n = n; + params.k = k; + params.alpha = alpha; + params.a = a; + params.lda = lda; + params.stride_a = stridea; + params.b = b; + params.ldb = ldb; + params.stride_b = strideb; + params.beta = beta; + params.c = c; + params.ldc = ldc; + params.stride_c = stridec; + params.batch = num_batches; + + bool transa_ = ((transa != 'n') && (transa != 'N')); + bool transb_ = ((transb != 'n') && (transb != 'N')); + + if (transa_ && transb_) { + static tunable::GemmStridedBatchedTunableOp bgemm{}; + bgemm(¶ms); + } + else if (transa_ && !transb_) { + static tunable::GemmStridedBatchedTunableOp bgemm{}; + bgemm(¶ms); + } + else if (!transa_ && transb_) { + static tunable::GemmStridedBatchedTunableOp bgemm{}; + bgemm(¶ms); + } + else if (!transa_ && !transb_) { + static tunable::GemmStridedBatchedTunableOp bgemm{}; + bgemm(¶ms); + } + else { + TORCH_CHECK(false, "unreachable"); + } +} + +template <> +void bgemm(CUDABLAS_BGEMM_ARGTYPES(double)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + bgemm_tunable(CUDABLAS_BGEMM_ARGS(double)); + } + else { + bgemm_internal(CUDABLAS_BGEMM_ARGS(double)); + } +} + +template <> +void bgemm(CUDABLAS_BGEMM_ARGTYPES(float)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + bgemm_tunable(CUDABLAS_BGEMM_ARGS(float)); + } + else { + bgemm_internal(CUDABLAS_BGEMM_ARGS(float)); + } +} + +template <> +void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + bgemm_tunable>(CUDABLAS_BGEMM_ARGS(c10::complex)); + } + else { + bgemm_internal>(CUDABLAS_BGEMM_ARGS(c10::complex)); + } +} + +template <> +void bgemm>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + bgemm_tunable>(CUDABLAS_BGEMM_ARGS(c10::complex)); + } + else { + bgemm_internal>(CUDABLAS_BGEMM_ARGS(c10::complex)); + } +} + +template <> +void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + bgemm_tunable(CUDABLAS_BGEMM_ARGS(at::Half)); + } + else { + bgemm_internal(CUDABLAS_BGEMM_ARGS(at::Half)); + } +} + +template <> +void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + bgemm_tunable(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } + else { + bgemm_internal(CUDABLAS_BGEMM_ARGS(at::BFloat16)); + } +} + +template +inline void gemm_internal_cublaslt(CUDABLAS_GEMM_ARGTYPES(Dtype)) { + // forward to bgemm implementation but set strides and batches to 0 + bgemm_internal_cublaslt(transa, transb, m, n, k, alpha, a, lda, 0, b, ldb, 0, beta, c, ldc, 0, 0); +} 
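[Illustrative aside, not part of the patch] The call layering this patch sets up for gemm/bgemm, summarized schematically (the bgemm pieces appear above, the gemm counterparts below; template arguments are reconstructed here since the diff rendering drops them):

    // at::cuda::blas::gemm<float>(args)
    //   -> tunable::getTuningContext()->IsTunableOpEnabled()
    //        ? gemm_tunable<float>(args)            // TunableOp path (GemmTunableOp)
    //        : gemm_internal<float>(args)
    // gemm_internal<float>(args)
    //   -> at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt
    //        ? gemm_internal_cublaslt<float>(args)  // forwards to bgemm_internal_cublaslt
    //                                               // with strides and num_batches set to 0
    //        : gemm_internal_cublas<float>(args)    // plain cuBLAS path
    // bgemm<float> follows the same shape via bgemm_tunable / bgemm_internal /
    // bgemm_internal_cublaslt / bgemm_internal_cublas.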
+ +template +inline void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(Dtype)) { + AT_ERROR("at::cuda::blas::gemm_internal_cublas: not implemented for ", typeid(Dtype).name()); } template <> -void gemm(CUDABLAS_GEMM_ARGTYPES(double)) { +void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(double)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -375,7 +864,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(double)) { } template <> -void gemm(CUDABLAS_GEMM_ARGTYPES(float)) { +void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(float)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -388,7 +877,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(float)) { } template <> -void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { +void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -403,7 +892,7 @@ void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { } template <> -void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { +void gemm_internal_cublas>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -418,7 +907,7 @@ void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { } template <> -void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { +void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::Half)) { // See Note [Writing Nondeterministic Operations] globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); @@ -514,7 +1003,7 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { } template <> -void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { +void gemm_internal_cublas(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { globalContext().alertCuBLASConfigNotDeterministic(); cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle(); cublasOperation_t opa = _cublasOpFromChar(transa); @@ -558,136 +1047,195 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); } -#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) - -#if defined(USE_ROCM) && ROCM_VERSION >= 50700 && ROCM_VERSION < 60000 -// only for rocm 5.7 where we first supported hipblaslt, it was difficult -// to hipify correctly without this change. 
-#define hipDataType hipblasDatatype_t +template <> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +#ifdef USE_ROCM + // hipblaslt does not support double gemm yet + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(double)); +#else + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(double)); #endif + } + else { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(double)); + } +} -// hipblaslt custom types were a temporary work-around -#if defined(USE_ROCM) && ROCM_VERSION >= 60000 && HIPBLASLT_CUSTOM_DATA_TYPE -hipblasltDatatype_t hipToLt(hipDataType type) { - switch (type) { - case HIP_R_32F: return HIPBLASLT_R_32F; - case HIP_R_64F: return HIPBLASLT_R_64F; - case HIP_R_16F: return HIPBLASLT_R_16F; - case HIP_R_8I: return HIPBLASLT_R_8I; - case HIP_C_32F: return HIPBLASLT_C_32F; - case HIP_C_64F: return HIPBLASLT_C_64F; - case HIP_C_16F: return HIPBLASLT_C_16F; - case HIP_C_8I: return HIPBLASLT_C_8I; - case HIP_R_8U: return HIPBLASLT_R_8U; - case HIP_C_8U: return HIPBLASLT_C_8U; - case HIP_R_32I: return HIPBLASLT_R_32I; - case HIP_C_32I: return HIPBLASLT_C_32I; - case HIP_R_32U: return HIPBLASLT_R_32U; - case HIP_C_32U: return HIPBLASLT_C_32U; - case HIP_R_16BF: return HIPBLASLT_R_16B; - case HIP_C_16BF: return HIPBLASLT_C_16B; - default: TORCH_CHECK(false); - } +template <> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(float)); + } + else { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(float)); + } } -#define HIPTOLT(type) hipToLt(type) + +template <> +void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +#ifdef USE_ROCM + // hipblaslt does not support complex gemm yet + gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); #else -#define HIPTOLT(type) type + gemm_internal_cublaslt>(CUDABLAS_GEMM_ARGS(c10::complex)); #endif - -#if defined(USE_ROCM) && ROCM_VERSION >= 60000 && HIPBLASLT_CUSTOM_COMPUTE_TYPE -hipblasLtComputeType_t hipblasToLt(hipblasComputeType_t type) { - switch (type) { - case HIPBLAS_COMPUTE_32F: return HIPBLASLT_COMPUTE_F32; - case HIPBLAS_COMPUTE_32F_FAST_16F: return HIPBLASLT_COMPUTE_F32_FAST_F16; - case HIPBLAS_COMPUTE_32F_FAST_TF32: return HIPBLASLT_COMPUTE_F32_FAST_XF32; - case HIPBLAS_COMPUTE_64F: return HIPBLASLT_COMPUTE_F64; - case HIPBLAS_COMPUTE_32I: return HIPBLASLT_COMPUTE_I32; - default: TORCH_CHECK(false); - } + } + else { + gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); + } } -#define HIPCOMPTOLT(type) hipblasToLt(type) + +template <> +void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { +#ifdef USE_ROCM + // hipblaslt does not support complex gemm yet + gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); #else -#define HIPCOMPTOLT(type) type + gemm_internal_cublaslt>(CUDABLAS_GEMM_ARGS(c10::complex)); #endif + } + else { + gemm_internal_cublas>(CUDABLAS_GEMM_ARGS(c10::complex)); + } +} -namespace { -// Following the pattern of CuSparseDescriptor -// Defined here for now because this is the only place cublas_lt interface is -// used but can be moved to a header once cublas_lt interface is used in -// multiple places. 
-template -struct CuBlasLtDeleter { - void operator()(T* x) { - if (x != nullptr) { - TORCH_CUDABLAS_CHECK(destructor(x)); - } +template <> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::Half)); } -}; + else { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::Half)); + } +} -template -class CuBlasLtDescriptor { - public: - T* descriptor() const { - return descriptor_.get(); +template <> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) +{ + if (at::globalContext().blasPreferredBackend() == BlasBackend::Cublaslt) { + gemm_internal_cublaslt(CUDABLAS_GEMM_ARGS(at::BFloat16)); } - T* descriptor() { - return descriptor_.get(); + else { + gemm_internal_cublas(CUDABLAS_GEMM_ARGS(at::BFloat16)); } +} - protected: - std::unique_ptr> descriptor_; -}; +template +inline void gemm_tunable(CUDABLAS_GEMM_ARGTYPES(DType)) { + tunable::GemmParams params; + params.transa = transa; + params.transb = transb; + params.m = m; + params.n = n; + params.k = k; + params.alpha = alpha; + params.a = a; + params.lda = lda; + params.b = b; + params.ldb = ldb; + params.beta = beta; + params.c = c; + params.ldc = ldc; -class CuBlasLtMatmulDescriptor : public CuBlasLtDescriptor< - cublasLtMatmulDescOpaque_t, - &cublasLtMatmulDescDestroy> { - public: - CuBlasLtMatmulDescriptor( - cublasComputeType_t compute_type, - cudaDataType_t scale_type) { - cublasLtMatmulDesc_t raw_descriptor = nullptr; - TORCH_CUDABLAS_CHECK( - cublasLtMatmulDescCreate(&raw_descriptor, HIPCOMPTOLT(compute_type), HIPTOLT(scale_type))); - descriptor_.reset(raw_descriptor); + bool transa_ = ((transa != 'n') && (transa != 'N')); + bool transb_ = ((transb != 'n') && (transb != 'N')); + + if (transa_ && transb_) { + static tunable::GemmTunableOp gemm{}; + gemm(¶ms); } - template - inline void setAttribute(cublasLtMatmulDescAttributes_t attr, const T value) { - TORCH_CUDABLAS_CHECK(::cublasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(T))); + else if (transa_ && !transb_) { + static tunable::GemmTunableOp gemm{}; + gemm(¶ms); } -}; + else if (!transa_ && transb_) { + static tunable::GemmTunableOp gemm{}; + gemm(¶ms); + } + else if (!transa_ && !transb_) { + static tunable::GemmTunableOp gemm{}; + gemm(¶ms); + } + else { + TORCH_CHECK(false, "unreachable"); + } +} -class CuBlasLtMatrixLayout : public CuBlasLtDescriptor< - cublasLtMatrixLayoutOpaque_t, - &cublasLtMatrixLayoutDestroy> { - public: - CuBlasLtMatrixLayout( - cudaDataType_t type, - uint64_t rows, - uint64_t cols, - int64_t ld, - bool t = false) { - cublasLtMatrixLayout_t raw_descriptor = nullptr; - TORCH_CUDABLAS_CHECK( - cublasLtMatrixLayoutCreate(&raw_descriptor, HIPTOLT(type), t ? cols : rows, t ? 
rows : cols, ld)); - descriptor_.reset(raw_descriptor); +template <> +void gemm(CUDABLAS_GEMM_ARGTYPES(double)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + gemm_tunable(CUDABLAS_GEMM_ARGS(double)); } -}; + else { + gemm_internal(CUDABLAS_GEMM_ARGS(double)); + } +} -class CuBlasLtMatmulPreference : public CuBlasLtDescriptor< - cublasLtMatmulPreferenceOpaque_t, - &cublasLtMatmulPreferenceDestroy> { - public: - CuBlasLtMatmulPreference() { - cublasLtMatmulPreference_t raw_descriptor = nullptr; - TORCH_CUDABLAS_CHECK(cublasLtMatmulPreferenceCreate(&raw_descriptor)); - descriptor_.reset(raw_descriptor); +template <> +void gemm(CUDABLAS_GEMM_ARGTYPES(float)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + gemm_tunable(CUDABLAS_GEMM_ARGS(float)); } - template - inline void setAttribute(cublasLtMatmulPreferenceAttributes_t attr, const T value) { - TORCH_CUDABLAS_CHECK(::cublasLtMatmulPreferenceSetAttribute(descriptor(), attr, &value, sizeof(T))); + else { + gemm_internal(CUDABLAS_GEMM_ARGS(float)); } -}; -} // namespace +} + +template <> +void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + gemm_tunable>(CUDABLAS_GEMM_ARGS(c10::complex)); + } + else { + gemm_internal>(CUDABLAS_GEMM_ARGS(c10::complex)); + } +} + +template <> +void gemm>(CUDABLAS_GEMM_ARGTYPES(c10::complex)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + gemm_tunable>(CUDABLAS_GEMM_ARGS(c10::complex)); + } + else { + gemm_internal>(CUDABLAS_GEMM_ARGS(c10::complex)); + } +} + +template <> +void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + gemm_tunable(CUDABLAS_GEMM_ARGS(at::Half)); + } + else { + gemm_internal(CUDABLAS_GEMM_ARGS(at::Half)); + } +} + +template <> +void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)) { + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + gemm_tunable(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } + else { + gemm_internal(CUDABLAS_GEMM_ARGS(at::BFloat16)); + } +} + +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) template void gemm_and_bias( @@ -745,8 +1293,11 @@ void gemm_and_bias( epilogue = CUBLASLT_EPILOGUE_GELU_BIAS; #endif } - computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue); - computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias); + + if (bias != nullptr) { + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_EPILOGUE, epilogue); + computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias); + } CuBlasLtMatrixLayout Adesc(abcType, m, k, mat1_ld, transpose_mat1); CuBlasLtMatrixLayout Bdesc(abcType, k, n, mat2_ld, transpose_mat2); @@ -771,6 +1322,7 @@ void gemm_and_bias( auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); auto workspace = allocator.allocate(workspaceSize); + TORCH_CHECK(workspace.get() != nullptr, "OOM trying to allocate workspace for cublaslt"); cublasLtMatmulHeuristicResult_t heuristicResult = {}; int returnedResult = 0; @@ -921,21 +1473,32 @@ void scaled_gemm( ScalarType result_dtype, void* amax_ptr, bool use_fast_accum) { - #if CUDA_VERSION >= 11080 +#if CUDA_VERSION >= 11080 || (defined(USE_ROCM) && ROCM_VERSION >= 60000) const auto computeType = CUBLAS_COMPUTE_32F; const auto scaleType = CUDA_R_32F; const 
int8_t fastAccuMode = use_fast_accum ? 1 : 0; + const float alpha_val = 1.0; + const float beta_val = 0.0; CuBlasLtMatmulDescriptor computeDesc(computeType, scaleType); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSA, _cublasOpFromChar(transa)); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_TRANSB, _cublasOpFromChar(transb)); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr); computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr); +#ifndef USE_ROCM +if (isFloat8Type(result_dtype)) { computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_AMAX_D_POINTER, amax_ptr); +} computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_FAST_ACCUM, fastAccuMode); +#endif CuBlasLtMatrixLayout Adesc(ScalarTypeToCudaDataType(mat1_dtype), m, k, mat1_ld, transa == 't'); CuBlasLtMatrixLayout Bdesc(ScalarTypeToCudaDataType(mat2_dtype), k, n, mat2_ld, transb == 't'); +#ifdef USE_ROCM + // Cdesc is unused, beta is 0. But hipblaslt needs this set to something reasonable. + CuBlasLtMatrixLayout Cdesc(ScalarTypeToCudaDataType(result_dtype), m, n, result_ld); +#else CuBlasLtMatrixLayout Cdesc(ScalarTypeToCudaDataType(bias_dtype), m, n, result_ld); +#endif CuBlasLtMatrixLayout Ddesc(ScalarTypeToCudaDataType(result_dtype), m, n, result_ld); if (bias_ptr) { computeDesc.setAttribute(CUBLASLT_MATMUL_DESC_BIAS_POINTER, bias_ptr); @@ -945,6 +1508,7 @@ void scaled_gemm( size_t workspaceSize = _getWorkspaceSize(); auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); auto workspace = allocator.allocate(workspaceSize); + TORCH_CHECK(workspace.get() != nullptr, "OOM trying to allocate workspace for cublaslt"); CuBlasLtMatmulPreference preference; preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, workspaceSize); @@ -963,10 +1527,53 @@ void scaled_gemm( &heuristicResult, &returnedResult)); if (returnedResult == 0) { +#ifndef USE_ROCM TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED); +#else + // hipblaslt might be able to recover by returning all algos + std::vector all_algos; + TORCH_CUDABLAS_CHECK(hipblaslt_ext::getAllAlgos( + ltHandle, + hipblaslt_ext::GemmType::HIPBLASLT_GEMM, + _cublasOpFromChar(transa), + _cublasOpFromChar(transb), + HIPTOLT(ScalarTypeToCudaDataType(mat1_dtype)), + HIPTOLT(ScalarTypeToCudaDataType(mat2_dtype)), + // C is nullptr and beta=0, so set to something reasonable. See above. 
+ //HIPTOLT(ScalarTypeToCudaDataType(bias_dtype)), + HIPTOLT(ScalarTypeToCudaDataType(result_dtype)), + HIPTOLT(ScalarTypeToCudaDataType(result_dtype)), + HIPCOMPTOLT(CUBLAS_COMPUTE_32F), + all_algos)); + if (all_algos.size() == 0) { + TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED); + } + // pick first valid solution + bool found = false; + for (size_t i = 0; i < all_algos.size(); i++) { + size_t ret_workspace_size = 0; + auto is_valid_status = hipblaslt_ext::matmulIsAlgoSupported( + ltHandle, + computeDesc.descriptor(), + &alpha_val, + Adesc.descriptor(), + Bdesc.descriptor(), + &beta_val, + Cdesc.descriptor(), + Ddesc.descriptor(), + all_algos[i].algo, + ret_workspace_size); + if (is_valid_status == HIPBLAS_STATUS_SUCCESS) { + if (ret_workspace_size <= workspaceSize) { + heuristicResult = all_algos[i]; + found = true; + break; + } + } + } + TORCH_CHECK(found, "could not find valid hipblaslt solution"); +#endif } - float alpha_val = 1.0; - float beta_val = 0.0; cublasStatus_t cublasStatus = cublasLtMatmul( ltHandle, computeDesc.descriptor(), @@ -976,7 +1583,11 @@ void scaled_gemm( mat2_ptr, Bdesc.descriptor(), &beta_val, +#ifdef USE_ROCM + result_ptr, // unused, since beta_val is 0, but hipblaslt can't handle nullptr +#else nullptr, +#endif Cdesc.descriptor(), result_ptr, Ddesc.descriptor(), @@ -1009,7 +1620,7 @@ void scaled_gemm( " scaleType ", scaleType); return; - #endif // CUDA_VERSION >= 11080 +#endif // CUDA_VERSION >= 11080 || (defined(USE_ROCM) && ROCM_VERSION >= 60000) TORCH_CHECK(false, "scaled_gemm is only supported for CUDA 11.8 and above"); } @@ -1044,11 +1655,34 @@ void int8_gemm( CuBlasLtMatrixLayout Bdesc(abType, k, n, mat2_ld, transpose_mat2); CuBlasLtMatrixLayout Cdesc(cType, m, n, result_ld); - cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle(); - // cublas team: alpha and beta need to be the same dtype as of scaleType at::opmath_type alpha_val = 1; int32_t beta_val = 0; + cublasLtHandle_t ltHandle = at::cuda::getCurrentCUDABlasLtHandle(); + +#ifdef USE_ROCM + CuBlasLtMatmulPreference preference; + size_t workspaceSize = _getWorkspaceSize(); + preference.setAttribute(CUBLASLT_MATMUL_PREF_MAX_WORKSPACE_BYTES, workspaceSize); + auto& allocator = *::c10::cuda::CUDACachingAllocator::get(); + auto workspace = allocator.allocate(workspaceSize); + cublasLtMatmulHeuristicResult_t heuristicResult = {}; + int returnedResult = 0; + TORCH_CUDABLAS_CHECK(cublasLtMatmulAlgoGetHeuristic( + ltHandle, + computeDesc.descriptor(), + Adesc.descriptor(), + Bdesc.descriptor(), + Cdesc.descriptor(), + Cdesc.descriptor(), + preference.descriptor(), + 1, + &heuristicResult, + &returnedResult)); + if (returnedResult == 0) { + TORCH_CUDABLAS_CHECK(CUBLAS_STATUS_NOT_SUPPORTED); + } +#endif cublasStatus_t cublasStatus = cublasLtMatmul( ltHandle, @@ -1063,9 +1697,21 @@ void int8_gemm( Cdesc.descriptor(), result_ptr, Cdesc.descriptor(), +#ifdef USE_ROCM + &heuristicResult.algo, +#else nullptr, // Heuristics don't seem to work for int8 +#endif +#ifdef USE_ROCM + workspace.mutable_get(), +#else nullptr, // Non-zero workspace doesn't seem to work. 
+#endif +#ifdef USE_ROCM + workspaceSize, +#else 0, +#endif at::cuda::getCurrentCUDAStream()); TORCH_CHECK( cublasStatus == CUBLAS_STATUS_SUCCESS, @@ -1099,7 +1745,7 @@ void int8_gemm( TORCH_CHECK(false, "int8_gemm is only supported for ROCm 6.0 and above"); #endif // !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 60000) } -#endif // (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) +#endif // !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) // ROCm 5.6 hipblas matches the const Dtype *A API, but prior hipblas does not. #if defined(USE_ROCM) && ROCM_VERSION < 50600 diff --git a/aten/src/ATen/cuda/CUDABlas.h b/aten/src/ATen/cuda/CUDABlas.h index ee3b41b4376a9..24aad7678ec49 100644 --- a/aten/src/ATen/cuda/CUDABlas.h +++ b/aten/src/ATen/cuda/CUDABlas.h @@ -44,6 +44,8 @@ class PointerModeGuard { const Dtype *a, int64_t lda, const Dtype *b, int64_t ldb, at::opmath_type beta,\ Dtype *c, int64_t ldc +#define CUDABLAS_GEMM_ARGS(Dtype) transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc + template inline void gemm(CUDABLAS_GEMM_ARGTYPES(Dtype)) { AT_ERROR("at::cuda::blas::gemm: not implemented for ", typeid(Dtype).name()); @@ -62,7 +64,25 @@ void gemm(CUDABLAS_GEMM_ARGTYPES(at::Half)); template <> void gemm(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); -#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) +template +inline void gemm_internal(CUDABLAS_GEMM_ARGTYPES(Dtype)) { + AT_ERROR("at::cuda::blas::gemm_internal: not implemented for ", typeid(Dtype).name()); +} + +template <> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES(double)); +template <> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES(float)); +template <> +void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)); +template <> +void gemm_internal>(CUDABLAS_GEMM_ARGTYPES(c10::complex)); +template <> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::Half)); +template <> +void gemm_internal(CUDABLAS_GEMM_ARGTYPES(at::BFloat16)); + +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) enum GEMMAndBiasActivationEpilogue { None, RELU, @@ -131,6 +151,9 @@ void scaled_gemm( const Dtype *b, int64_t ldb, int64_t strideb, \ at::opmath_type beta, Dtype *c, int64_t ldc, int64_t stridec, int64_t num_batches +#define CUDABLAS_BGEMM_ARGS(Dtype) \ + transa, transb, m, n, k, alpha, a, lda, stridea, b, ldb, strideb, beta, c, ldc, stridec, num_batches + template inline void bgemm(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { AT_ERROR("at::cuda::blas::bgemm: not implemented for ", typeid(Dtype).name()); @@ -149,6 +172,24 @@ void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::Half)); template <> void bgemm(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); +template +inline void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(Dtype)) { + AT_ERROR("at::cuda::blas::bgemm_internal: not implemented for ", typeid(Dtype).name()); +} + +template <> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(double)); +template <> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(float)); +template <> +void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)); +template <> +void bgemm_internal>(CUDABLAS_BGEMM_ARGTYPES(c10::complex)); +template <> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::Half)); +template <> +void bgemm_internal(CUDABLAS_BGEMM_ARGTYPES(at::BFloat16)); + #if defined(USE_ROCM) && ROCM_VERSION <= 50500 // ROCm 5.6 hipblas matches the const Dtype *A API, but prior hipblas does not. 
#define CUDABLAS_TRSM_ARGTYPES(Dtype) \ diff --git a/aten/src/ATen/cuda/CUDAContext.cpp b/aten/src/ATen/cuda/CUDAContext.cpp index 946bbf77497ef..ab92001f5ef0d 100644 --- a/aten/src/ATen/cuda/CUDAContext.cpp +++ b/aten/src/ATen/cuda/CUDAContext.cpp @@ -44,7 +44,7 @@ cudaDeviceProp* getCurrentDeviceProperties() { return getDeviceProperties(device); } -cudaDeviceProp* getDeviceProperties(int64_t device) { +cudaDeviceProp* getDeviceProperties(c10::DeviceIndex device) { c10::call_once(init_flag, initCUDAContextVectors); if (device == -1) device = c10::cuda::current_device(); AT_ASSERT(device >= 0 && device < num_gpus, "device=", device, ", num_gpus=", num_gpus); @@ -52,7 +52,7 @@ cudaDeviceProp* getDeviceProperties(int64_t device) { return &device_properties[device]; } -bool canDeviceAccessPeer(int64_t device, int64_t peer_device) { +bool canDeviceAccessPeer(c10::DeviceIndex device, c10::DeviceIndex peer_device) { c10::call_once(init_flag, initCUDAContextVectors); if (device == -1) device = c10::cuda::current_device(); AT_ASSERT(device >= 0 && device < num_gpus, "device=", device, ", num_gpus=", num_gpus); diff --git a/aten/src/ATen/cuda/CUDAContextLight.h b/aten/src/ATen/cuda/CUDAContextLight.h index c189ed20fd4b0..60d09dfaee169 100644 --- a/aten/src/ATen/cuda/CUDAContextLight.h +++ b/aten/src/ATen/cuda/CUDAContextLight.h @@ -9,7 +9,7 @@ // cublasLT was introduced in CUDA 10.1 but we enable only for 11.1 that also // added bf16 support -#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) #include #endif @@ -71,18 +71,18 @@ TORCH_CUDA_CPP_API cudaDeviceProp* getCurrentDeviceProperties(); TORCH_CUDA_CPP_API int warp_size(); -TORCH_CUDA_CPP_API cudaDeviceProp* getDeviceProperties(int64_t device); +TORCH_CUDA_CPP_API cudaDeviceProp* getDeviceProperties(c10::DeviceIndex device); TORCH_CUDA_CPP_API bool canDeviceAccessPeer( - int64_t device, - int64_t peer_device); + c10::DeviceIndex device, + c10::DeviceIndex peer_device); TORCH_CUDA_CPP_API c10::Allocator* getCUDADeviceAllocator(); /* Handles */ TORCH_CUDA_CPP_API cusparseHandle_t getCurrentCUDASparseHandle(); TORCH_CUDA_CPP_API cublasHandle_t getCurrentCUDABlasHandle(); -#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) TORCH_CUDA_CPP_API cublasLtHandle_t getCurrentCUDABlasLtHandle(); #endif diff --git a/aten/src/ATen/cuda/CUDADataType.h b/aten/src/ATen/cuda/CUDADataType.h index 3068eb787a837..8615bcdae9117 100644 --- a/aten/src/ATen/cuda/CUDADataType.h +++ b/aten/src/ATen/cuda/CUDADataType.h @@ -31,8 +31,6 @@ template<> inline cudaDataType getCudaDataType>() { return CUDA_C_64F; } -// HIP doesn't define integral types -#ifndef USE_ROCM template<> inline cudaDataType getCudaDataType() { return CUDA_R_8U; } @@ -42,9 +40,7 @@ template<> inline cudaDataType getCudaDataType() { template<> inline cudaDataType getCudaDataType() { return CUDA_R_32I; } -#endif -#if !defined(USE_ROCM) template<> inline cudaDataType getCudaDataType() { return CUDA_R_16I; } @@ -54,19 +50,15 @@ template<> inline cudaDataType getCudaDataType() { template<> inline cudaDataType getCudaDataType() { return CUDA_R_16BF; } -#endif inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type) { switch (scalar_type) { -// HIP doesn't define integral types -#ifndef USE_ROCM case c10::ScalarType::Byte: return CUDA_R_8U; case 
c10::ScalarType::Char: return CUDA_R_8I; case c10::ScalarType::Int: return CUDA_R_32I; -#endif case c10::ScalarType::Half: return CUDA_R_16F; case c10::ScalarType::Float: @@ -79,7 +71,6 @@ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type) return CUDA_C_32F; case c10::ScalarType::ComplexDouble: return CUDA_C_64F; -#if !defined(USE_ROCM) case c10::ScalarType::Short: return CUDA_R_16I; case c10::ScalarType::Long: @@ -92,6 +83,18 @@ inline cudaDataType ScalarTypeToCudaDataType(const c10::ScalarType& scalar_type) case c10::ScalarType::Float8_e5m2: return CUDA_R_8F_E5M2; #endif +#if defined(USE_ROCM) +#if defined(HIP_NEW_TYPE_ENUMS) + case c10::ScalarType::Float8_e4m3fnuz: + return HIP_R_8F_E4M3_FNUZ; + case c10::ScalarType::Float8_e5m2fnuz: + return HIP_R_8F_E5M2_FNUZ; +#else + case c10::ScalarType::Float8_e4m3fnuz: + return static_cast(1000); + case c10::ScalarType::Float8_e5m2fnuz: + return static_cast(1001); +#endif #endif default: TORCH_INTERNAL_ASSERT(false, "Cannot convert ScalarType ", scalar_type, " to cudaDataType.") diff --git a/aten/src/ATen/cuda/CUDAEvent.h b/aten/src/ATen/cuda/CUDAEvent.h index ca6878721406b..e3c331a9c99fa 100644 --- a/aten/src/ATen/cuda/CUDAEvent.h +++ b/aten/src/ATen/cuda/CUDAEvent.h @@ -48,9 +48,9 @@ struct TORCH_CUDA_CPP_API CUDAEvent { CUDAGuard guard(device_index_); const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_deletion(reinterpret_cast(event_)); + (*interp)->trace_gpu_event_deletion(at::kCUDA, reinterpret_cast(event_)); } - cudaEventDestroy(event_); + AT_CUDA_CHECK(cudaEventDestroy(event_)); } } catch (...) { /* No throw */ } } @@ -122,7 +122,7 @@ struct TORCH_CUDA_CPP_API CUDAEvent { AT_CUDA_CHECK(cudaEventRecord(event_, stream)); const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_record( + (*interp)->trace_gpu_event_record(at::kCUDA, reinterpret_cast(event_), reinterpret_cast(stream.stream()) ); @@ -138,7 +138,7 @@ struct TORCH_CUDA_CPP_API CUDAEvent { AT_CUDA_CHECK(cudaStreamWaitEvent(stream, event_, 0)); const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_wait( + (*interp)->trace_gpu_event_wait(at::kCUDA, reinterpret_cast(event_), reinterpret_cast(stream.stream()) ); @@ -151,6 +151,10 @@ struct TORCH_CUDA_CPP_API CUDAEvent { TORCH_CHECK(is_created_ && other.isCreated(), "Both events must be recorded before calculating elapsed time."); float time_ms = 0; + // We do not strictly have to set the device index to the same as our event, + // but if we don't and the current device is not initialized, it will + // create a new cuda context, which will consume a lot of memory. 
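// [Illustrative aside, not part of the patch] Typical elapsed-time measurement that
// the CUDAGuard added below now protects (sketch; uses the CUDAEvent API from this header):
//   at::cuda::CUDAEvent start(cudaEventDefault), stop(cudaEventDefault);  // timing-capable
//   auto stream = at::cuda::getCurrentCUDAStream();
//   start.record(stream);
//   /* ... enqueue work on `stream` ... */
//   stop.record(stream);
//   stop.synchronize();
//   float ms = start.elapsed_time(stop);  // guarded, so it no longer initializes a fresh
//                                         // context on an unrelated current device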
+ CUDAGuard guard(device_index_); // raise cudaErrorNotReady if either event is recorded but not yet completed AT_CUDA_CHECK(cudaEventElapsedTime(&time_ms, event_, other.event_)); return time_ms; @@ -161,7 +165,7 @@ struct TORCH_CUDA_CPP_API CUDAEvent { if (is_created_) { const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_synchronization(reinterpret_cast(event_)); + (*interp)->trace_gpu_event_synchronization(at::kCUDA, reinterpret_cast(event_)); } AT_CUDA_CHECK(cudaEventSynchronize(event_)); } @@ -191,7 +195,7 @@ struct TORCH_CUDA_CPP_API CUDAEvent { AT_CUDA_CHECK(cudaEventCreateWithFlags(&event_, flags_)); const c10::impl::PyInterpreter* interp = c10::impl::GPUTrace::get_trace(); if (C10_UNLIKELY(interp)) { - (*interp)->trace_gpu_event_creation(reinterpret_cast(event_)); + (*interp)->trace_gpu_event_creation(at::kCUDA, reinterpret_cast(event_)); } is_created_ = true; } diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp index b8004ec7e7e37..7e19ce98fbf9d 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.cpp @@ -1,10 +1,13 @@ +#include +#include #include #include +#include #include #include #include #include -#include +#include namespace at { namespace cuda::detail { @@ -24,10 +27,10 @@ static std::deque cuda_gens_init_flag; static std::vector default_gens_cuda; /* -* Populates the global variables related to CUDA generators -* Warning: this function must only be called once! -*/ -static void initCUDAGenVector(){ + * Populates the global variables related to CUDA generators + * Warning: this function must only be called once! + */ +static void initCUDAGenVector() { num_gpus = c10::cuda::device_count(); cuda_gens_init_flag.resize(num_gpus); default_gens_cuda.resize(num_gpus); @@ -77,6 +80,150 @@ Generator createCUDAGenerator(DeviceIndex device_index) { } // namespace cuda::detail +/** + * Creates a clone of this CUDA Generator State. + */ +c10::intrusive_ptr CUDAGeneratorState::clone() { + return make_intrusive( + seed_, philox_offset_per_thread_, offset_intragraph_); +} + +/** + * Function to increase the internal offset based on the specified increment. + */ +void CUDAGeneratorState::increase(uint64_t increment) { + // Rounds increment up to the nearest multiple of 4 to meet alignment + // requirements. + // see Note [Why enforce RNG offset % 4 == 0?] + increment = ((increment + 3) / 4) * 4; + // Handling different behaviors based on whether capturing is active. + if (at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None) { + // Ensures that the state is actually capturing. + TORCH_CHECK( + capturing_, + "Attempt to increase offset for a CUDA generator not in capture mode."); + // Ensures the offset is a multiple of 4 + // see Note [Why enforce RNG offset % 4 == 0?] + TORCH_INTERNAL_ASSERT( + offset_intragraph_ % 4 == 0, "RNG offset must be a multiple of 4."); + // Ensures the increment does not cause overflow. + TORCH_INTERNAL_ASSERT( + offset_intragraph_ <= std::numeric_limits::max() - increment, + "Increment causes overflow in the offset value."); + offset_intragraph_ += increment; + } else { + // Checks that the increment is expected outside graph capturing. + TORCH_CHECK( + !capturing_, + "Offset increment outside graph capture encountered unexpectedly."); + // Ensures the offset is a multiple of 4 + // see Note [Why enforce RNG offset % 4 == 0?] 
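// [Illustrative aside, not part of the patch] The rounding at the top of increase()
// keeps philox offsets 4-aligned, which is what these assertions check, e.g.:
//   increment = 10 -> ((10 + 3) / 4) * 4 = 12
//   increment = 16 -> ((16 + 3) / 4) * 4 = 16  (already a multiple of 4, unchanged)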
+ TORCH_INTERNAL_ASSERT( + philox_offset_per_thread_ % 4 == 0, + "RNG offset must be a multiple of 4."); + philox_offset_per_thread_ += increment; + } +} + +/** + * Registers this state to a CUDA graph to manage within the graph. + */ +void CUDAGeneratorState::register_graph(cuda::CUDAGraph* graph) { + // Ensures that the RNG state is not currently being captured. + at::cuda::assertNotCapturing( + "Cannot register the state during capturing stage."); + + // If this is the first graph to be registered, allocate memory for the seed + // and offset on the GPU. + if (registered_graphs_.empty()) { + auto options = at::TensorOptions().device(at::kCUDA).dtype(at::kLong); + seed_extragraph_ = at::empty({1}, options); + offset_extragraph_ = at::empty({1}, options); + } + + // Insert the graph into the set of registered graphs if it's not already + // registered. + if (registered_graphs_.find(graph) == registered_graphs_.end()) { + registered_graphs_.insert(graph); + } +} + +/** + * Unregisters a CUDA graph from the RNG state. + */ +void CUDAGeneratorState::unregister_graph(cuda::CUDAGraph* graph) { + // Ensures that the RNG state is not currently being captured. + at::cuda::assertNotCapturing( + "Cannot unregister the state during capturing stage."); + // Verify the graph was previously registered. + TORCH_CHECK( + registered_graphs_.find(graph) != registered_graphs_.end(), + "The graph should be registered to the state"); + + // Remove the graph from the set of registered graphs. + registered_graphs_.erase(graph); + + // If no more graphs are registered, deallocate the GPU memory for the seed + // and offset. + if (registered_graphs_.empty()) { + seed_extragraph_.reset(); + offset_extragraph_.reset(); + } +} + +/** + * Note [Explicit Registration of Generators to the CUDA Graph] + * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + * + * Ideally, it would be more user-friendly if the state could be exchanged and generators + * could be registered with the CUDA graph implicitly. However, resetting GPU tensors during + * the capture stage causes these reset operations to be recorded within the CUDA graph. + * This behavior is undesirable because we do not want these tensors to be reset during + * the replay stage of the graph. + * + * As of now, there is no available method to perform a CUDA operation during the graph's + * recording phase without having that operation be included in the CUDA graph. + * This limitation necessitates explicit user action to register generators with the graph. + * By requiring users to manually register their generators, we can ensure that state resets + * (capture_prologue) only occur before the graph capture begins, thus avoiding unintended + * resets during the replay of the graph. See https://github.com/pytorch/pytorch/pull/114068. + */ + +/** + * Performs the prologue steps for capturing a CUDA graph state. + * This method is intended to reset graph-related state variables before capturing begins. + */ +void CUDAGeneratorState::capture_prologue() { + capturing_ = true; + offset_intragraph_ = 0; + seed_extragraph_.fill_(int64_t(seed_)); + offset_extragraph_.fill_(int64_t(0)); +} + +/** + * Ends the capturing phase and resets related variables, returning the whole + * graph increment. + */ +uint64_t CUDAGeneratorState::capture_epilogue() { + capturing_ = false; + return offset_intragraph_; +} + +/** + * Prepares the state for replay by setting initial state tensors and applying + * total increment. 
+ */ +void CUDAGeneratorState::replay_prologue(uint64_t wholegraph_increment) { + // Ensures the generator is not in capturing mode. + at::cuda::assertNotCapturing( + "Cannot prepare for replay during capturing stage."); + seed_extragraph_.fill_(int64_t(seed_)); + offset_extragraph_.fill_(int64_t(philox_offset_per_thread_)); + // Applies the total increment achieved during previous captures to update the + // offset. + increase(wholegraph_increment); +} + /** * Note [Why enforce RNG offset % 4 == 0?] * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -97,8 +244,18 @@ Generator createCUDAGenerator(DeviceIndex device_index) { */ CUDAGeneratorImpl::CUDAGeneratorImpl(DeviceIndex device_index) : c10::GeneratorImpl{Device(DeviceType::CUDA, device_index), - DispatchKeySet(c10::DispatchKey::CUDA)} { + DispatchKeySet(c10::DispatchKey::CUDA)} { at::cuda::assertNotCapturing("Cannot construct a new CUDAGeneratorImpl"); + state_ = make_intrusive(); + no_reset_rnn_state_.clear(); +} + +CUDAGeneratorImpl::CUDAGeneratorImpl( + DeviceIndex device_index, + c10::intrusive_ptr state) + : c10:: + GeneratorImpl{Device(DeviceType::CUDA, device_index), DispatchKeySet(c10::DispatchKey::CUDA)}, + state_(std::move(state)) { no_reset_rnn_state_.clear(); } @@ -109,9 +266,10 @@ CUDAGeneratorImpl::CUDAGeneratorImpl(DeviceIndex device_index) * See Note [Acquire lock when using random generators] */ void CUDAGeneratorImpl::set_current_seed(uint64_t seed) { - at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::set_current_seed"); - seed_ = seed; - philox_offset_per_thread_ = 0; + at::cuda::assertNotCapturing( + "Cannot call CUDAGeneratorImpl::set_current_seed"); + state_->seed_ = seed; + state_->philox_offset_per_thread_ = 0; no_reset_rnn_state_.clear(); } @@ -134,15 +292,9 @@ uint64_t CUDAGeneratorImpl::get_offset() const { // Debatable if get_offset() should be allowed in captured regions. // Conservatively disallow it for now. at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::get_offset"); - return philox_offset_per_thread_; + return state_->philox_offset_per_thread_; } -#define CAPTURE_DEFAULT_GENS_MSG \ -"In regions captured by CUDA graphs, you may only use the default CUDA RNG " \ -"generator on the device that's current when capture begins. " \ -"If you need a non-default (user-supplied) generator, or a generator on another " \ -"device, please file an issue." - /** * Gets the current seed of CUDAGeneratorImpl. */ @@ -150,7 +302,7 @@ uint64_t CUDAGeneratorImpl::current_seed() const { // Debatable if current_seed() should be allowed in captured regions. // Conservatively disallow it for now. at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::current_seed"); - return seed_; + return state_->seed_; } /** @@ -194,6 +346,8 @@ c10::intrusive_ptr CUDAGeneratorImpl::get_state() const { * and size of the internal state. 
*/ void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { + at::cuda::assertNotCapturing( + "Please ensure to utilize the CUDAGeneratorImpl::set_state_index method during capturing."); static const size_t seed_size = sizeof(uint64_t); static const size_t offset_size = sizeof(int64_t); static const size_t total_size = seed_size + offset_size; @@ -208,7 +362,7 @@ void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { TORCH_CHECK(new_state_size == total_size, "RNG state is wrong size"); } - uint64_t input_seed; + uint64_t input_seed = 0; auto new_rng_state = new_state.data_dtype_initialized(); memcpy(&input_seed, new_rng_state, seed_size); this->set_current_seed(input_seed); @@ -219,44 +373,59 @@ void CUDAGeneratorImpl::set_state(const c10::TensorImpl& new_state) { this->set_philox_offset_per_thread(static_cast(philox_offset)); } +/** + * Sets the generator's current state to + * This function allows switching between different registered states of + * the generator. + */ +void CUDAGeneratorImpl::graphsafe_set_state( + const c10::intrusive_ptr& gen) { + c10::intrusive_ptr cuda_gen = + dynamic_intrusive_pointer_cast(gen); + TORCH_CHECK(cuda_gen, "Expected a CUDA Generator"); + state_ = cuda_gen->state_; +} + +/** + * Get the GeneratorImpl that point to current state_ + */ +c10::intrusive_ptr CUDAGeneratorImpl::graphsafe_get_state() + const { + auto gen = make_intrusive(device().index(), state_); + return gen; +} + /** * Sets the philox_offset_per_thread_ to be used by curandStatePhilox4_32_10 * * See Note [Acquire lock when using random generators] */ void CUDAGeneratorImpl::set_philox_offset_per_thread(uint64_t offset) { - at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::set_philox_offset_per_thread"); // see Note [Why enforce RNG offset % 4 == 0?] TORCH_CHECK(offset % 4 == 0, "offset must be a multiple of 4"); - philox_offset_per_thread_ = offset; + state_->philox_offset_per_thread_ = offset; } /** * Gets the current philox_offset_per_thread_ of CUDAGeneratorImpl. */ uint64_t CUDAGeneratorImpl::philox_offset_per_thread() const { - at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::philox_offset_per_thread"); - return philox_offset_per_thread_; + return state_->philox_offset_per_thread_; } /** - * Called by CUDAGraph to prepare this instance for a graph capture region. - * offset_extragraph is the initial offset at the start of the graphed region. - * offset_intragraph tracks the offset in the graphed region. + * Registers this state to a CUDA graph to manage within the graph. */ -void CUDAGeneratorImpl::capture_prologue(int64_t* seed_extragraph, int64_t* offset_extragraph) { - seed_extragraph_ = seed_extragraph; - offset_extragraph_ = offset_extragraph; - offset_intragraph_ = 0; - graph_expects_this_gen_ = true; +void CUDAGeneratorImpl::register_graph(cuda::CUDAGraph* graph) { + graph->register_generator_state(state_); + state_->register_graph(graph); } /** - * Called by CUDAGraph to finalize a graph capture region for this instance. + * Unregisters a CUDA graph from the RNG state. 
*/ -uint64_t CUDAGeneratorImpl::capture_epilogue() { - graph_expects_this_gen_ = false; - return offset_intragraph_; +void CUDAGeneratorImpl::unregister_graph(cuda::CUDAGraph* graph) { + state_->unregister_graph(graph); } /** @@ -281,30 +450,17 @@ uint64_t CUDAGeneratorImpl::capture_epilogue() { * See Note [Acquire lock when using random generators] */ PhiloxCudaState CUDAGeneratorImpl::philox_cuda_state(uint64_t increment) { - // rounds increment up to the nearest multiple of 4 - increment = ((increment + 3) / 4) * 4; if (at::cuda::currentStreamCaptureStatus() != at::cuda::CaptureStatus::None) { - TORCH_CHECK(graph_expects_this_gen_, - "philox_cuda_state for an unexpected CUDA generator used during capture. " - CAPTURE_DEFAULT_GENS_MSG); - // see Note [Why enforce RNG offset % 4 == 0?] - TORCH_INTERNAL_ASSERT(this->offset_intragraph_ % 4 == 0); - uint32_t offset = this->offset_intragraph_; - TORCH_INTERNAL_ASSERT(this->offset_intragraph_ <= - std::numeric_limits::max() - increment); - this->offset_intragraph_ += increment; - return PhiloxCudaState(this->seed_extragraph_, - this->offset_extragraph_, - offset); + uint32_t offset = state_->offset_intragraph_; + state_->increase(increment); + return PhiloxCudaState( + state_->seed_extragraph_.data_ptr(), + state_->offset_extragraph_.data_ptr(), + offset); } else { - TORCH_CHECK(!graph_expects_this_gen_, - "CUDA generator expects graph capture to be underway, " - "but the current stream is not capturing."); - // see Note [Why enforce RNG offset % 4 == 0?] - TORCH_INTERNAL_ASSERT(this->philox_offset_per_thread_ % 4 == 0); - uint64_t offset = this->philox_offset_per_thread_; - this->philox_offset_per_thread_ += increment; - return PhiloxCudaState(this->seed_, offset); + uint64_t offset = state_->philox_offset_per_thread_; + state_->increase(increment); + return PhiloxCudaState(state_->seed_, offset); } } @@ -312,16 +468,13 @@ PhiloxCudaState CUDAGeneratorImpl::philox_cuda_state(uint64_t increment) { * Temporarily accommodates call sites that use philox_engine_inputs. * Allows incremental refactor of call sites to use philox_cuda_state. */ -std::pair CUDAGeneratorImpl::philox_engine_inputs(uint64_t increment) { - at::cuda::assertNotCapturing("Refactor this op to use CUDAGeneratorImpl::philox_cuda_state. " - "Cannot call CUDAGeneratorImpl::philox_engine_inputs"); - // rounds increment up to the nearest multiple of 4 - increment = ((increment + 3) / 4) * 4; - // see Note [Why enforce RNG offset % 4 == 0?] - TORCH_INTERNAL_ASSERT(this->philox_offset_per_thread_ % 4 == 0); - uint64_t offset = this->philox_offset_per_thread_; - this->philox_offset_per_thread_ += increment; - return std::make_pair(this->seed_, offset); +std::pair CUDAGeneratorImpl::philox_engine_inputs( + uint64_t increment) { + at::cuda::assertNotCapturing( + "Refactor this op to use CUDAGeneratorImpl::philox_cuda_state. 
Cannot call CUDAGeneratorImpl::philox_engine_inputs"); + uint64_t offset = state_->philox_offset_per_thread_; + state_->increase(increment); + return std::make_pair(state_->seed_, offset); } /* @@ -348,9 +501,7 @@ std::shared_ptr CUDAGeneratorImpl::clone() const { */ CUDAGeneratorImpl* CUDAGeneratorImpl::clone_impl() const { at::cuda::assertNotCapturing("Cannot call CUDAGeneratorImpl::clone_impl"); - auto gen = new CUDAGeneratorImpl(this->device().index()); - gen->set_current_seed(this->seed_); - gen->set_philox_offset_per_thread(this->philox_offset_per_thread_); + auto gen = new CUDAGeneratorImpl(this->device().index(), state_->clone()); return gen; } diff --git a/aten/src/ATen/cuda/CUDAGeneratorImpl.h b/aten/src/ATen/cuda/CUDAGeneratorImpl.h index 2fe8a6f6c8f4f..0fe664e35f54c 100644 --- a/aten/src/ATen/cuda/CUDAGeneratorImpl.h +++ b/aten/src/ATen/cuda/CUDAGeneratorImpl.h @@ -1,12 +1,19 @@ #pragma once +#include #include +#include #include -#include -#include #include - +#include +#include +#include namespace at { + +namespace cuda { +struct CUDAGraph; +} + /** * Note [CUDA Graph-safe RNG states] * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -87,9 +94,41 @@ namespace at { * */ +struct CUDAGeneratorState : public c10::intrusive_ptr_target { + uint64_t seed_; + uint64_t philox_offset_per_thread_; + uint32_t offset_intragraph_; + bool capturing_{}; + std::unordered_set registered_graphs_; + at::TensorBase seed_extragraph_{}; + at::TensorBase offset_extragraph_{}; + + CUDAGeneratorState( + uint64_t seed = default_rng_seed_val, + uint64_t philox_offset_per_thread = 0, + uint32_t offset_intragraph = 0) + : seed_(seed), + philox_offset_per_thread_(philox_offset_per_thread), + offset_intragraph_(offset_intragraph) {} + + void increase(uint64_t increment); + + void register_graph(cuda::CUDAGraph* graph); + void unregister_graph(cuda::CUDAGraph* graph); + + void capture_prologue(); + // capture_epilogue returns the wholegraph_increment + uint64_t capture_epilogue(); + void replay_prologue(uint64_t wholegraph_increment); + c10::intrusive_ptr clone(); +}; + struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl { // Constructors CUDAGeneratorImpl(DeviceIndex device_index = -1); + CUDAGeneratorImpl( + DeviceIndex device_index, + c10::intrusive_ptr state_); ~CUDAGeneratorImpl() override = default; // CUDAGeneratorImpl methods @@ -101,10 +140,18 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl { uint64_t seed() override; void set_state(const c10::TensorImpl& new_state) override; c10::intrusive_ptr get_state() const override; + void graphsafe_set_state( + const c10::intrusive_ptr& state) override; + c10::intrusive_ptr graphsafe_get_state() const override; + void set_philox_offset_per_thread(uint64_t offset); uint64_t philox_offset_per_thread() const; - void capture_prologue(int64_t* seed_extragraph, int64_t* offset_extragraph); - uint64_t capture_epilogue(); + + void register_graph(cuda::CUDAGraph* graph); + void unregister_graph(cuda::CUDAGraph* graph); + + // Generates a PhiloxCudaState with a specified increment, and increment + // current state PhiloxCudaState philox_cuda_state(uint64_t increment); bool reset_rnn_state() { @@ -117,14 +164,10 @@ struct TORCH_CUDA_CPP_API CUDAGeneratorImpl : public c10::GeneratorImpl { static c10::DeviceType device_type(); -private: + private: CUDAGeneratorImpl* clone_impl() const override; - uint64_t seed_ = default_rng_seed_val; - uint64_t philox_offset_per_thread_ = 0; - int64_t* seed_extragraph_{}; - int64_t* 
offset_extragraph_{}; - uint32_t offset_intragraph_ = 0; - bool graph_expects_this_gen_ = false; + + c10::intrusive_ptr state_; std::atomic_flag no_reset_rnn_state_; }; diff --git a/aten/src/ATen/cuda/CUDAGraph.cpp b/aten/src/ATen/cuda/CUDAGraph.cpp index 1093426c983b6..436408f88a519 100644 --- a/aten/src/ATen/cuda/CUDAGraph.cpp +++ b/aten/src/ATen/cuda/CUDAGraph.cpp @@ -6,7 +6,10 @@ #include #include +#include +#include #include +#include namespace at::cuda { @@ -86,26 +89,33 @@ CUDAGraph::CUDAGraph() #endif } +void CUDAGraph::register_generator_state( + c10::intrusive_ptr state) { + captured_generator_states_[std::move(state)] = 0; +} + +void CUDAGraph::register_generator_state(const at::Generator& generator) { + c10::intrusive_ptr cuda_gen = + dynamic_intrusive_pointer_cast( + generator.getIntrusivePtr()); + cuda_gen->register_graph(this); +} + void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capture_mode) { #if !defined(USE_ROCM) || ROCM_VERSION >= 50300 TORCH_CHECK(!has_graph_exec_, "This CUDAGraph instance already owns a captured graph. " "To capture a new graph, create a new instance."); - // For now, a CUDAGraph instance only accommodates the default generator on the device that's - // current when capture begins. If any op in the captured region uses a non-default generator, - // or a generator on another device, the offending generator will throw an error. - // These restrictions simplify CUDAGraph, but could be relaxed in the future: - // in principle, the underlying Cuda calls do permit cross-device ops to be captured. + // default generator is always registered auto* gen = get_generator_or_default( c10::nullopt, cuda::detail::getDefaultCUDAGenerator()); + gen->register_graph(this); - auto options = TensorOptions().device(at::kCUDA).dtype(at::kLong); - seed_extragraph_ = at::empty({1}, options); - offset_extragraph_ = at::empty({1}, options); - - seed_extragraph_.fill_(int64_t(gen->current_seed())); - gen->capture_prologue(seed_extragraph_.data_ptr(), offset_extragraph_.mutable_data_ptr()); + for (auto& [generator_state, wholegraph_increments] : + captured_generator_states_) { + generator_state->capture_prologue(); + } auto stream = at::cuda::getCurrentCUDAStream(); @@ -115,7 +125,6 @@ void CUDAGraph::capture_begin(MempoolId_t pool/*=0*/, cudaStreamCaptureMode capt "default stream.)"); capture_stream_ = stream; - capture_gen_ = gen; capture_dev_ = c10::cuda::current_device(); id_ = capture_sequence_id(); @@ -215,13 +224,10 @@ void CUDAGraph::capture_end() { has_graph_exec_ = true; - auto* gen = get_generator_or_default( - c10::nullopt, cuda::detail::getDefaultCUDAGenerator()); - TORCH_CHECK(gen == capture_gen_, - "Default CUDA RNG generator on current device at capture end " - "is different from default generator on current device " - "when capture began"); - wholegraph_increment_ = gen->capture_epilogue(); + for (auto& [generator_state, wholegraph_increments] : + captured_generator_states_) { + wholegraph_increments = generator_state->capture_epilogue(); + } size_t numCUDAGraphNodes = 0; AT_CUDA_CHECK(cudaGraphGetNodes(graph_, NULL, &numCUDAGraphNodes)); @@ -251,17 +257,10 @@ void CUDAGraph::replay() { c10::OptionalDeviceGuard device_guard{capture_stream_.device()}; - // Just like any RNG consumer kernel! 
- auto* gen = get_generator_or_default( - c10::nullopt, cuda::detail::getDefaultCUDAGenerator()); - PhiloxCudaState rng_engine_inputs; - { - std::lock_guard lock(gen->mutex_); - rng_engine_inputs = gen->philox_cuda_state(wholegraph_increment_); + for (auto& [generator_state, wholegraph_increments] : + captured_generator_states_) { + generator_state->replay_prologue(wholegraph_increments); } - seed_extragraph_.fill_(int64_t(gen->current_seed())); - offset_extragraph_.fill_(int64_t(rng_engine_inputs.offset_.val)); - // graph_exec_ may be replayed in any stream. AT_CUDA_CHECK(cudaGraphLaunch(graph_exec_, at::cuda::getCurrentCUDAStream())); @@ -355,6 +354,10 @@ TORCH_CHECK(has_graph_exec_, } CUDAGraph::~CUDAGraph() { + for (auto& [generator_state, wholegraph_increments] : + captured_generator_states_) { + generator_state->unregister_graph(this); + } reset(); } diff --git a/aten/src/ATen/cuda/CUDAGraph.h b/aten/src/ATen/cuda/CUDAGraph.h index b395de9a252a7..3acdad18b0eee 100644 --- a/aten/src/ATen/cuda/CUDAGraph.h +++ b/aten/src/ATen/cuda/CUDAGraph.h @@ -4,12 +4,13 @@ #include #include #include - -#include +#include namespace at { +struct Generator; struct CUDAGeneratorImpl; +struct CUDAGeneratorState; namespace cuda { @@ -24,7 +25,12 @@ struct TORCH_CUDA_CPP_API CUDAGraph { static void inc_pending_event_queries(); static void dec_pending_event_queries(); static int num_pending_event_queries(); - void capture_begin(MempoolId_t pool={0, 0}, cudaStreamCaptureMode capture_mode = cudaStreamCaptureModeGlobal); + // See Note [Explicit Registration of Generators to the CUDA Graph] + void register_generator_state(c10::intrusive_ptr state); + void register_generator_state(const at::Generator& generator); + void capture_begin( + MempoolId_t pool = {0, 0}, + cudaStreamCaptureMode capture_mode = cudaStreamCaptureModeGlobal); void capture_end(); void replay(); void reset(); @@ -32,7 +38,7 @@ struct TORCH_CUDA_CPP_API CUDAGraph { void enable_debug_mode(); void debug_dump(const std::string& debug_path); - protected: + protected: #if !defined(USE_ROCM) || ROCM_VERSION >= 50300 cudaGraph_t graph_ = NULL; cudaGraphExec_t graph_exec_ = NULL; @@ -73,19 +79,16 @@ struct TORCH_CUDA_CPP_API CUDAGraph { // Stream on which capture began at::cuda::CUDAStream capture_stream_; - // Default generator on device where capture began - at::CUDAGeneratorImpl* capture_gen_; + // multiple generator states and their wholegraph_increments in this graph + // that are managed by the CUDA Graph + ska::flat_hash_map, uint64_t> + captured_generator_states_; // Device where capture occurred. Right now, for simplicity, we require all ops // in a capture to run on the same device, but this is a limitation of CUDAGraph, // not CUDA itself. We can straightforwardly modify CUDAGraph to support multi-device // captures if needed. 
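As a hedged sketch of how a caller drives the registration API introduced by this patch (names come from the headers above; stream handling is simplified, and production code would normally warm up allocations before capturing):

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAGeneratorImpl.h>
#include <ATen/cuda/CUDAGraph.h>
#include <c10/cuda/CUDAGuard.h>
#include <c10/cuda/CUDAStream.h>

void capture_with_user_generator() {
  // A user-supplied CUDA generator; previously only the default generator on
  // the capturing device could be used inside a graph.
  at::Generator gen = at::cuda::detail::createCUDAGenerator();

  // Capture must not run on the default stream.
  c10::cuda::CUDAStreamGuard stream_guard(c10::cuda::getStreamFromPool());

  at::cuda::CUDAGraph graph;
  // See Note [Explicit Registration of Generators to the CUDA Graph]:
  // registering up front lets capture_prologue reset the extragraph
  // seed/offset tensors before recording begins.
  graph.register_generator_state(gen);

  graph.capture_begin();
  auto sample = at::randn({8}, gen, at::TensorOptions().device(at::kCUDA));
  graph.capture_end();

  graph.replay();  // replay_prologue refreshes seed/offset, then relaunches the graph
}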
int capture_dev_; - - // RNG state trackers - at::Tensor seed_extragraph_; - at::Tensor offset_extragraph_; - uint64_t wholegraph_increment_; }; } // namespace cuda diff --git a/aten/src/ATen/cuda/CUDASparse.h b/aten/src/ATen/cuda/CUDASparse.h index 0d4520938291c..1052469ea7d8a 100644 --- a/aten/src/ATen/cuda/CUDASparse.h +++ b/aten/src/ATen/cuda/CUDASparse.h @@ -30,33 +30,19 @@ #endif #if defined(USE_ROCM) - // hipSparse const API added in v2.4.0 #if HIPSPARSE_VERSION >= 200400 #define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 1 -#define AT_USE_HIPSPARSE_GENERIC_52_API() 0 +#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0 #define AT_USE_HIPSPARSE_GENERIC_API() 1 #else #define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0 - -// hipSparse Generic API ROCm 5.2 -#if ROCM_VERSION >= 50200 -#define AT_USE_HIPSPARSE_GENERIC_52_API() 1 -#else -#define AT_USE_HIPSPARSE_GENERIC_52_API() 0 -#endif - -// hipSparse Generic API ROCm 5.1 -#if ROCM_VERSION >= 50100 +#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 1 #define AT_USE_HIPSPARSE_GENERIC_API() 1 -#else -#define AT_USE_HIPSPARSE_GENERIC_API() 0 #endif - -#endif // HIPSPARSE_VERSION >= 200400 #else // USE_ROCM #define AT_USE_HIPSPARSE_CONST_DESCRIPTORS() 0 -#define AT_USE_HIPSPARSE_GENERIC_52_API() 0 +#define AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() 0 #define AT_USE_HIPSPARSE_GENERIC_API() 0 #endif // USE_ROCM diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp index e01663b3f28c9..3004eb142684f 100644 --- a/aten/src/ATen/cuda/CUDASparseDescriptors.cpp +++ b/aten/src/ATen/cuda/CUDASparseDescriptors.cpp @@ -7,6 +7,10 @@ namespace at::cuda::sparse { +cusparseStatus_t destroyConstDnMat(const cusparseDnMatDescr* dnMatDescr) { + return cusparseDestroyDnMat(const_cast(dnMatDescr)); +} + #if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() namespace { @@ -51,8 +55,8 @@ cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type) { } } -#if AT_USE_HIPSPARSE_GENERIC_52_API() || AT_USE_CUSPARSE_GENERIC_API() -CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset) { +#if AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() +cusparseDnMatDescr_t createRawDnMatDescriptor(const Tensor& input, int64_t batch_offset, bool is_const=false) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(input.layout() == kStrided); IntArrayRef input_strides = input.strides(); IntArrayRef input_sizes = input.sizes(); @@ -79,12 +83,16 @@ CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input, int64_t ba #endif auto batch_stride = ndim > 2 && batch_offset >= 0 ? input_strides[ndim - 3] : 0; - void* values_ptr = static_cast(input.data_ptr()) + + void* data_ptr = is_const ? 
const_cast(input.const_data_ptr()) : input.data_ptr(); + void* values_ptr = static_cast(data_ptr) + batch_offset * batch_stride * input.itemsize(); cudaDataType value_type = ScalarTypeToCudaDataType(input.scalar_type()); check_supported_cuda_type(value_type); + // NOTE: Ideally, in the const case, we would use cusparseConstDnMatDescr_t + // and cusparseCreateConstDnMat, but those were introduced in CUDA 12, and we + // still need to support CUDA 11 cusparseDnMatDescr_t raw_descriptor; TORCH_CUDASPARSE_CHECK(cusparseCreateDnMat( &raw_descriptor, @@ -101,10 +109,17 @@ CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input, int64_t ba TORCH_CUDASPARSE_CHECK(cusparseDnMatSetStridedBatch( raw_descriptor, batch_count, input_strides[ndim - 3])); } + return raw_descriptor; +} - descriptor_.reset(raw_descriptor); +CuSparseDnMatDescriptor::CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset) { + descriptor_.reset(createRawDnMatDescriptor(input, batch_offset)); +} + +CuSparseConstDnMatDescriptor::CuSparseConstDnMatDescriptor(const Tensor& input, int64_t batch_offset) { + descriptor_.reset(createRawDnMatDescriptor(input, batch_offset, /*is_const*/true)); } -#endif // AT_USE_HIPSPARSE_GENERIC_52_API() || AT_USE_CUSPARSE_GENERIC_API() +#endif // AT_USE_CUSPARSE_GENERIC_API() || AT_USE_HIPSPARSE_GENERIC_API() CuSparseDnVecDescriptor::CuSparseDnVecDescriptor(const Tensor& input) { // cuSPARSE doesn't support batched vectors @@ -175,7 +190,6 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input, int6 value_type // data type of values )); -#if AT_USE_HIPSPARSE_GENERIC_52_API() || !defined(USE_ROCM) if (ndim == 3 && batch_offset == -1) { int batch_count = at::native::cuda_int_cast(at::native::batchCount(input), "batch_count"); @@ -197,9 +211,6 @@ CuSparseSpMatCsrDescriptor::CuSparseSpMatCsrDescriptor(const Tensor& input, int6 cusparseCsrSetStridedBatch(raw_descriptor, batch_count, 0, 0)); } } -#else - TORCH_CHECK(ndim == 2, "Experimental support for batched CSR matrices is implemented only for CUDA 11+"); -#endif descriptor_.reset(raw_descriptor); } diff --git a/aten/src/ATen/cuda/CUDASparseDescriptors.h b/aten/src/ATen/cuda/CUDASparseDescriptors.h index 03958b1d404b9..9e3d50f34e77b 100644 --- a/aten/src/ATen/cuda/CUDASparseDescriptors.h +++ b/aten/src/ATen/cuda/CUDASparseDescriptors.h @@ -73,6 +73,10 @@ using bsrsm2Info = std::remove_pointer::type; #endif #endif +// NOTE: This is only needed for CUDA 11 and earlier, since CUDA 12 introduced +// API for const descriptors +cusparseStatus_t destroyConstDnMat(const cusparseDnMatDescr* dnMatDescr); + class TORCH_CUDA_CPP_API CuSparseMatDescriptor : public CuSparseDescriptor { public: @@ -123,14 +127,25 @@ class TORCH_CUDA_CPP_API CuSparseBsrsm2Info cusparseIndexType_t getCuSparseIndexType(const c10::ScalarType& scalar_type); -#if AT_USE_HIPSPARSE_GENERIC_52_API() || \ - (AT_USE_CUSPARSE_GENERIC_API() && AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS()) +#if AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_NON_CONST_DESCRIPTORS() class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor : public CuSparseDescriptor { public: explicit CuSparseDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1); }; +class TORCH_CUDA_CPP_API CuSparseConstDnMatDescriptor + : public CuSparseDescriptor { + public: + explicit CuSparseConstDnMatDescriptor(const Tensor& input, int64_t batch_offset = -1); + cusparseDnMatDescr* unsafe_mutable_descriptor() const { + return const_cast(descriptor()); + } + cusparseDnMatDescr* 
unsafe_mutable_descriptor() { + return const_cast(descriptor()); + } +}; + class TORCH_CUDA_CPP_API CuSparseDnVecDescriptor : public CuSparseDescriptor { public: @@ -140,8 +155,6 @@ class TORCH_CUDA_CPP_API CuSparseDnVecDescriptor class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor : public CuSparseDescriptor {}; -//AT_USE_HIPSPARSE_GENERIC_52_API() || (AT_USE_CUSPARSE_GENERIC_API() && AT_USE_CUSPARSE_NON_CONST_DESCRIPTORS()) - #elif AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS() class TORCH_CUDA_CPP_API CuSparseDnMatDescriptor : public ConstCuSparseDescriptor< @@ -153,6 +166,22 @@ class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor int64_t batch_offset = -1); }; + class TORCH_CUDA_CPP_API CuSparseConstDnMatDescriptor + : public ConstCuSparseDescriptor< + const cusparseDnMatDescr, + &destroyConstDnMat> { + public: + explicit CuSparseConstDnMatDescriptor( + const Tensor& input, + int64_t batch_offset = -1); + cusparseDnMatDescr* unsafe_mutable_descriptor() const { + return const_cast(descriptor()); + } + cusparseDnMatDescr* unsafe_mutable_descriptor() { + return const_cast(descriptor()); + } + }; + class TORCH_CUDA_CPP_API CuSparseDnVecDescriptor : public ConstCuSparseDescriptor< cusparseDnVecDescr, @@ -165,7 +194,7 @@ class TORCH_CUDA_CPP_API CuSparseSpMatDescriptor : public ConstCuSparseDescriptor< cusparseSpMatDescr, &cusparseDestroySpMat> {}; -#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS() +#endif // AT_USE_CUSPARSE_CONST_DESCRIPTORS() || AT_USE_HIPSPARSE_CONST_DESCRIPTORS() class TORCH_CUDA_CPP_API CuSparseSpMatCsrDescriptor : public CuSparseSpMatDescriptor { diff --git a/aten/src/ATen/cuda/CachingHostAllocator.cpp b/aten/src/ATen/cuda/CachingHostAllocator.cpp index 22dbb661f18b4..f4f22711d61a3 100644 --- a/aten/src/ATen/cuda/CachingHostAllocator.cpp +++ b/aten/src/ATen/cuda/CachingHostAllocator.cpp @@ -8,34 +8,11 @@ #include #include -#include -#include #include -#include -#include -#include -#include -#include -#include namespace at::cuda { namespace { -struct BlockSize { - size_t size_{0}; - void* ptr_{nullptr}; -}; - -struct Block { - size_t size_{0}; - void* ptr_{nullptr}; - - std::mutex mutex_; - bool allocated_{false}; - size_t event_count_{0}; - std::unordered_set streams_; -}; - // Note: cudaEventCreate when concurrently invoked from multiple threads can be // very expensive (at least on certain device/driver combinations). Thus, we a) // serialize event creation at a per-device level, and b) pool the events to @@ -89,81 +66,12 @@ class EventPool { std::vector pools_; }; -// Used for heterogenous lookup support in the free list. 
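The comparator removed in this hunk relies on C++14 transparent comparison so the free list can be probed by size without constructing a block. A generic, self-contained sketch of the same heterogeneous-lookup trick (Chunk, ChunkKey and BySize are illustrative names, not types from this codebase):

#include <cstddef>
#include <cstdint>
#include <set>

struct Chunk    { size_t size; void* ptr; };
struct ChunkKey { size_t size; };

// `is_transparent` lets std::set::lower_bound accept a ChunkKey directly,
// so a best-fit lookup needs no temporary Chunk object.
struct BySize {
  using is_transparent = void;
  bool operator()(const Chunk* a, const Chunk* b) const {
    return a->size != b->size ? a->size < b->size
                              : (uintptr_t)a->ptr < (uintptr_t)b->ptr;
  }
  bool operator()(const Chunk* a, ChunkKey b) const { return a->size < b.size; }
  bool operator()(ChunkKey a, const Chunk* b) const { return a.size < b->size; }
};

using FreeList = std::set<Chunk*, BySize>;
// FreeList fl;  auto it = fl.lower_bound(ChunkKey{1024});  // first chunk of size >= 1 KiB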
-struct BlockComparator { - using is_transparent = void; - bool operator()(const Block* a, const Block* b) const { - if (a->size_ != b->size_) { - return a->size_ < b->size_; - } - return (uintptr_t)a->ptr_ < (uintptr_t)b->ptr_; - } - - // Transparent overloads - bool operator()(const Block* a, BlockSize b) const { - if (a->size_ != b.size_) { - return a->size_ < b.size_; - } - return (uintptr_t)a->ptr_ < (uintptr_t)b.ptr_; - } - bool operator()(BlockSize a, const Block* b) const { - if (a.size_ != b->size_) { - return a.size_ < b->size_; - } - return (uintptr_t)a.ptr_ < (uintptr_t)b->ptr_; - } -}; - -/** - * Note [CUDAHostAllocator design] - * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * We have three key data structures - the free list which stores blocks that - * are not currently used, the block list which stores all blocks that have been - * allocated, and the event queue which stores CUDA events and their - * corresponding blocks. - * - * Each of these are protected by a separate mutex. The key design principles - * are to 1) only hold each mutex for the minimal amount of time possible, 2) - * never do any possible expensive operations (such as CUDA runtime API calls) - * while holding the lock. - * - * There are three public methods: allocate, free, and record_event. In the - * allocate path, we first check to see if we can service our request from this - * free list, and otherwise we create a new block with cudaHostAlloc. In the - * free path, we insert events (if required) into the event queue, and if - * possible insert our block back into the free list. In allocate, we first - * eagerly query events until we find one that is not ready, and insert the - * corresponding block onto the free list if all the events recorded for a - * block are ready. In the record_event path, we simply insert the given - * stream into the set of streams tracked by the specified block. This set of - * streams is then consumed in the free path. - * - * Some of the invariants here are less strict than they could be - for example, - * we do not enforce that free(Block* block) => block->event_count == 0. This is - * for compatibility reasons, and we can explore enforcing these in subsequent - * versions. - */ -class CUDAHostAllocator { - public: - std::pair allocate(size_t size) { - if (size == 0) { - return {nullptr, nullptr}; - } +using Block = HostBlock; - process_events(); - - // First, try to allocate from the free list - { - std::lock_guard g(free_list_mutex_); - auto it = free_list_.lower_bound(BlockSize{size, nullptr}); - if (it != free_list_.end()) { - auto block = *it; - block->allocated_ = true; - free_list_.erase(it); - return {block->ptr_, reinterpret_cast(block)}; - } - } - // Then, create a new block. +struct CUDACachingHostAllocatorImpl + : public CachingHostAllocatorImpl { + private: + void allocate_host_memory(size_t size, void** ptr) override { // Pinned memory pointers allocated by any device can be directly used by // any other device, regardless of the current device at the time of // allocation, since we assume unified addressing. So we grab any existing @@ -176,188 +84,49 @@ class CUDAHostAllocator { at::Device(at::DeviceType::CUDA, *primary_ctx_device_index)); } - // Round up the allocation to the nearest power of two to improve reuse. 
- size_t roundSize = c10::llvm::PowerOf2Ceil(size); - void* ptr = nullptr; if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: pinned_use_cuda_host_register()) { - allocWithCudaHostRegister(&ptr, roundSize); + allocWithCudaHostRegister(ptr, size); } else { // Use cudaHostAlloc for allocating pinned memory (global lock in driver) - C10_CUDA_CHECK(cudaHostAlloc(&ptr, roundSize, cudaHostAllocDefault)); + C10_CUDA_CHECK(cudaHostAlloc(ptr, size, cudaHostAllocDefault)); } - - auto block = new Block(); - block->size_ = roundSize; - block->ptr_ = ptr; - block->allocated_ = true; - - { - std::lock_guard g(blocks_mutex_); - blocks_.insert(block); - ptr_to_block_.insert({block->ptr_, block}); - } - return {block->ptr_, reinterpret_cast(block)}; } - void free(void* ctx) { - if (!ctx) { - return; - } - - // Note: we can assume that free is correctly paired with alloc, - // and thus we do not need to look up the ctx in blocks_. - auto* block = reinterpret_cast(ctx); - - c10::optional> events; - { - std::lock_guard g(block->mutex_); - block->allocated_ = false; - if (block->streams_.empty()) { - TORCH_INTERNAL_ASSERT(block->event_count_ == 0); - } else { - events = std::vector(); - events->reserve(block->streams_.size()); - for (auto stream : block->streams_) { - auto event = event_pool_.get(stream.device_index()); - event->record(stream); - events->push_back(std::move(event)); - } - block->event_count_ += events->size(); - block->streams_.clear(); - } - } - - if (!events) { - std::lock_guard g(free_list_mutex_); - free_list_.insert(block); + void free_block(Block* block) override { + if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: + pinned_use_cuda_host_register()) { + void* ptr = block->ptr_; + AT_CUDA_CHECK(cudaHostUnregister(ptr)); + free(ptr); } else { - std::lock_guard g(cuda_events_mutex_); - for (auto&& event : *events) { - cuda_events_.emplace_front(std::move(event), block); - } + AT_CUDA_CHECK(cudaFreeHost(block->ptr_)); } } - bool record_event(void* ptr, void* ctx, at::cuda::CUDAStream stream) { - auto* block = reinterpret_cast(ctx); - - // Note: we need to check if the passed-in `ctx` is valid. This is because - // `record_event` (via `CachingHostAllocator_recordEvent`) can be invoked on - // an arbitrary tensor, and is not guaranteed to correspond to a pinned - // memory allocation. Therefore, we need to check that `ctx` is valid before - // proceeding. - { - std::lock_guard g(blocks_mutex_); - if (blocks_.find(block) != blocks_.end()) { - // Now we know this object is safe to access. - std::lock_guard gb(block->mutex_); - TORCH_INTERNAL_ASSERT(block->allocated_); - block->streams_.insert(stream); - return true; - } - auto it = ptr_to_block_.find(ptr); - if (it != ptr_to_block_.end()) { - block = it->second; - std::lock_guard g(block->mutex_); - TORCH_INTERNAL_ASSERT(block->allocated_); - block->streams_.insert(stream); - return true; - } - } - - return false; + void record_stream( + c10::optional>& events, + CUDAStream stream) override { + auto event = create_event_internal(stream.device_index()); + event->record(stream); + events->push_back(std::move(event)); } - void empty_cache() { - // Flush any available blocks into the free_list. - process_events(); - - // Release cached events from the event pool. - event_pool_.empty_cache(); - - // Remove all elements from the free list, remove them from the blocks - // list, and free the associated pinned memory allocation. 
This requires - // concurrently holding both the free list mutex and the blocks mutex, and - // is the only function that concurrently holds multiple mutexes. - std::lock(free_list_mutex_, blocks_mutex_); - std::lock_guard gf(free_list_mutex_, std::adopt_lock); - std::lock_guard gb(blocks_mutex_, std::adopt_lock); - - std::vector blocks_to_remove(free_list_.begin(), free_list_.end()); - free_list_.clear(); - for (auto* block : blocks_to_remove) { - blocks_.erase(block); - ptr_to_block_.erase(block->ptr_); - if (c10::cuda::CUDACachingAllocator::CUDAAllocatorConfig:: - pinned_use_cuda_host_register()) { - void* ptr = block->ptr_; - AT_CUDA_CHECK(cudaHostUnregister(ptr)); - free(ptr); - } else { - AT_CUDA_CHECK(cudaFreeHost(block->ptr_)); - } - delete block; + bool query_event(EventPool::Event& event) override { + cudaError_t err = cudaEventQuery(*event); + if (err == cudaErrorNotReady) { + (void)cudaGetLastError(); // clear CUDA error + return false; + } else if (err != cudaSuccess) { + C10_CUDA_CHECK(err); } + return true; } - private: - void process_events() { - while (true) { - // Avoid calling cudaEventDestroy while holding a mutex, so move - // intermediate events out of the lock into this object. - c10::optional> processed; - - { - std::lock_guard g(cuda_events_mutex_); - if (!cuda_events_.empty()) { - processed = std::move(cuda_events_.back()); - cuda_events_.pop_back(); - } - } - - if (!processed) { - return; - } - - // otherwise, query the event - { - // now, see if we can handle this element - auto& event = processed->first; - cudaError_t err = cudaEventQuery(*event); - if (err == cudaErrorNotReady) { - (void)cudaGetLastError(); // clear CUDA error - // push the event onto the back of the queue if it's not - // ready. TODO: do we need some debouncing logic to avoid allocating - // threads repeatedly spinning on an event? - { - std::lock_guard g(cuda_events_mutex_); - cuda_events_.push_back(std::move(*processed)); - } - return; - } else if (err != cudaSuccess) { - C10_CUDA_CHECK(err); - } - } - - // Process the events. - TORCH_INTERNAL_ASSERT(processed); - auto* block = processed->second; - bool available = false; - { - std::lock_guard g(block->mutex_); - TORCH_INTERNAL_ASSERT(!block->allocated_) - block->event_count_--; - if (block->event_count_ == 0) { - available = true; - } - } - - if (available) { - std::lock_guard g(free_list_mutex_); - free_list_.insert(block); - } - } + EventPool::Event create_event_internal(DeviceIndex idx) { + // Leak the event pool to avoid shutdown issue. + static auto* event_pool = new EventPool(); + return event_pool->get(idx); } TaskThreadPool* getThreadPool() { @@ -402,7 +171,7 @@ class CUDAHostAllocator { ""); } - inline void allocWithCudaHostRegister(void** ptr, size_t roundSize) { + void allocWithCudaHostRegister(void** ptr, size_t roundSize) { // Here we do regular allocation, pre-fault/map the pages, and then do // cudaHostRegister with GPU mapping flags to lock the pages, so we // can minimize the cost for the cuda global lock. 
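A minimal sketch of the register-based pinning path this comment describes: allocate pageable memory, touch every page so it is actually mapped, then pin it with cudaHostRegister (flags simplified to cudaHostRegisterDefault here; the patch parallelizes the pre-fault step across a thread pool):

#include <cuda_runtime.h>
#include <cstddef>
#include <cstdlib>

void* alloc_and_register(size_t size, size_t page_size = 4096) {
  void* ptr = std::malloc(size);
  if (ptr == nullptr) return nullptr;
  // Pre-fault: write one byte per page so the pages exist before registration.
  for (size_t off = 0; off < size; off += page_size) {
    static_cast<char*>(ptr)[off] = 0;
  }
  if (cudaHostRegister(ptr, size, cudaHostRegisterDefault) != cudaSuccess) {
    std::free(ptr);
    return nullptr;
  }
  return ptr;  // release later with cudaHostUnregister(ptr) followed by std::free(ptr)
}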
@@ -423,13 +192,19 @@ class CUDAHostAllocator { for (size_t i = 0; i < numMapThreads; i++) { promises.emplace_back(); futures.push_back(promises[i].get_future()); - auto task = [this, i, ptr, roundSize, numMapThreads, pageSize, &promises]() mutable { + auto task = [this, + i, + ptr, + roundSize, + numMapThreads, + pageSize, + &promises]() mutable { mapPagesForRegister( - *ptr, - roundSize, - i, // thread task-id - numMapThreads, - pageSize); + *ptr, + roundSize, + i, // thread task-id + numMapThreads, + pageSize); // set the promise when mapping pages are done promises[i].set_value(); }; @@ -446,62 +221,48 @@ class CUDAHostAllocator { // Register the mapped pages using cudaHostRegister registerPages(*ptr, roundSize); } +}; - EventPool event_pool_; - - alignas(64) std::mutex blocks_mutex_; - std::unordered_set blocks_; - std::unordered_map ptr_to_block_; - // Note: sharding this mutex seems to be profitable in heavily multi-threaded - // scenarios. - alignas(64) std::mutex free_list_mutex_; - // Note: an alternative datastructure can yield significant wins here in - // microbenchmarks. - std::set free_list_; +void raw_local_deleter(void* ptr); - alignas(64) std::mutex cuda_events_mutex_; - std::deque> cuda_events_; +struct CUDACachingHostAllocator final + : public CachingHostAllocatorInterface { + at::DataPtr allocate(size_t size) override { + auto ptr_and_ctx = impl_->allocate(size); + return { + ptr_and_ctx.first, + ptr_and_ctx.second, + &raw_local_deleter, + at::DeviceType::CPU}; + } }; -} // namespace +CUDACachingHostAllocator caching_host_allocator; -static CUDAHostAllocator& getCUDAHostAllocator() { - // leak and don't worry about shutdown - static auto* r = new CUDAHostAllocator(); - return *r; +static inline CUDACachingHostAllocator& getCUDACachingHostAllocator() { + return caching_host_allocator; } -static void CUDAHostAllocatorDeleter(void* ctx) { - getCUDAHostAllocator().free(ctx); +void raw_local_deleter(void* ptr) { + getCUDACachingHostAllocator().free(ptr); } +} // anonymous namespace + bool CachingHostAllocator_recordEvent( void* ptr, void* ctx, at::cuda::CUDAStream stream) { - return getCUDAHostAllocator().record_event(ptr, ctx, stream); + return getCUDACachingHostAllocator().record_event(ptr, ctx, stream); } // Releases cached pinned memory allocations via cudaHostFree void CachingHostAllocator_emptyCache() { - getCUDAHostAllocator().empty_cache(); + getCUDACachingHostAllocator().empty_cache(); } -struct CUDAHostAllocatorWrapper final : public at::Allocator { - at::DataPtr allocate(size_t size) const override { - auto ptr_and_ctx = getCUDAHostAllocator().allocate(size); - return { - ptr_and_ctx.first, - ptr_and_ctx.second, - &CUDAHostAllocatorDeleter, - at::DeviceType::CPU}; - } -}; - -static CUDAHostAllocatorWrapper cuda_host_allocator; - at::Allocator* getCachingHostAllocator() { - return &cuda_host_allocator; + return &getCUDACachingHostAllocator(); } } // namespace at::cuda diff --git a/aten/src/ATen/cuda/CachingHostAllocator.h b/aten/src/ATen/cuda/CachingHostAllocator.h index 65ad7f7d16e24..a7209582b2ba1 100644 --- a/aten/src/ATen/cuda/CachingHostAllocator.h +++ b/aten/src/ATen/cuda/CachingHostAllocator.h @@ -1,5 +1,6 @@ #pragma once +#include #include #include @@ -17,15 +18,14 @@ namespace at::cuda { // call between host and device, and passed the corresponding context from the // allocation. This is currently invoked by at::native::copy_kernel_cuda. 
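A hedged sketch of the consumer-side contract this header describes, roughly what at::native::copy_kernel_cuda does: take a pinned buffer from the caching host allocator, launch the async copy, then record the stream so the block is not recycled before the copy finishes:

#include <ATen/ATen.h>
#include <ATen/cuda/CUDAContext.h>
#include <ATen/cuda/CachingHostAllocator.h>
#include <c10/cuda/CUDAException.h>
#include <cuda_runtime.h>

void d2h_into_pinned(const at::Tensor& gpu_src) {
  // Pinned host buffer handed out by the caching host allocator.
  at::DataPtr host = at::cuda::getCachingHostAllocator()->allocate(gpu_src.nbytes());

  auto stream = at::cuda::getCurrentCUDAStream();
  C10_CUDA_CHECK(cudaMemcpyAsync(host.get(), gpu_src.const_data_ptr(),
                                 gpu_src.nbytes(), cudaMemcpyDeviceToHost, stream));

  // Mark the block as in use on `stream`; it returns to the free list only
  // after the recorded event has completed.
  at::cuda::CachingHostAllocator_recordEvent(host.get(), host.get_context(), stream);
}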
// -// Note that this allocator does not split larger allocations into smaller -// blocks, unlike the caching device allocator. -// TORCH_CUDA_CPP_API c10::Allocator* getCachingHostAllocator(); // Records an event in the specified stream. The allocation corresponding to the // input `ptr`/`ctx` will not be re-used until the event has occurred. -TORCH_CUDA_CPP_API bool -CachingHostAllocator_recordEvent(void* ptr, void* ctx, c10::cuda::CUDAStream stream); +TORCH_CUDA_CPP_API bool CachingHostAllocator_recordEvent( + void* ptr, + void* ctx, + c10::cuda::CUDAStream stream); // Releases cached pinned memory allocations via cudaHostFree TORCH_CUDA_CPP_API void CachingHostAllocator_emptyCache(); diff --git a/aten/src/ATen/cuda/CuSparseHandlePool.cpp b/aten/src/ATen/cuda/CuSparseHandlePool.cpp index 1a57044138ab2..58ba5019dff50 100644 --- a/aten/src/ATen/cuda/CuSparseHandlePool.cpp +++ b/aten/src/ATen/cuda/CuSparseHandlePool.cpp @@ -26,7 +26,7 @@ using CuSparsePoolType = DeviceThreadHandlePool +#include #include #include @@ -76,7 +77,7 @@ using CuBlasPoolType = DeviceThreadHandlePoolreserve(device); auto stream = c10::cuda::getCurrentCUDAStream(); TORCH_CUDABLAS_CHECK(cublasSetStream(handle, stream)); -#if !defined(USE_ROCM) && defined(CUDA_VERSION) && CUDA_VERSION < 12200 - // cuBLAS should not need an explicitly allocated workspace after CUDA 12.2 - // to avoid increasing memory usage during graph captures +#if !defined(USE_ROCM) + // We explicitly set the cublas workspace even though CUDA 12.2+ fixed the + // issue where memory usage increased during graph capture. // original issue: https://github.com/pytorch/pytorch/pull/83461 + // This is because in CUDA 12.2+, the use of cudaMallocAsync in cublas + // will allocate memory dynamically (even if they're cheap) outside + // PyTorch's CUDA caching allocator. It's possible that CCA used up + // all the memory and cublas's cudaMallocAsync will return OOM cudaStream_t _stream = stream; auto key = std::make_tuple(static_cast(handle), static_cast(_stream)); auto workspace_it = cublas_handle_stream_to_workspace().find(key); @@ -154,8 +171,6 @@ cublasHandle_t getCurrentCUDABlasHandle() { workspace_it = cublas_handle_stream_to_workspace().insert(workspace_it, {key, getNewWorkspace()}); } TORCH_CUDABLAS_CHECK(cublasSetWorkspace(handle, workspace_it->second.get(), getChosenWorkspaceSize())); -#endif -#if !defined(USE_ROCM) // On CUDA >= 11, and architecture >= Ampere, cuBLAS can use TF32 to speedup // FP32 data type calculations based on the value of the allow_tf32 flag. // To enable TF32, set the math mode of the handle to CUBLAS_TF32_TENSOR_OP_MATH. 
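For reference, a small sketch of how the TF32 decision above is driven from the global context (setAllowTF32CuBLAS and getCurrentCUDABlasHandle are existing ATen APIs; exact headers may differ across versions):

#include <ATen/Context.h>
#include <ATen/cuda/CUDAContext.h>
#include <cublas_v2.h>

// Toggle TF32 for FP32 cuBLAS GEMMs; the math mode is (re)applied every time
// the per-thread handle is fetched, based on the global flag.
void set_tf32_for_matmul(bool allow) {
  at::globalContext().setAllowTF32CuBLAS(allow);
  cublasHandle_t handle = at::cuda::getCurrentCUDABlasHandle();
  (void)handle;  // subsequent ATen matmuls on this thread/stream use this handle
}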
@@ -164,8 +179,7 @@ cublasHandle_t getCurrentCUDABlasHandle() { } else { TORCH_CUDABLAS_CHECK(cublasSetMathMode(handle, CUBLAS_DEFAULT_MATH)); } -#endif -#if defined(USE_ROCM) +#else hipblasAtomicsMode_t hipblas_mode; if (at::globalContext().deterministicAlgorithms()) { hipblas_mode = HIPBLAS_ATOMICS_NOT_ALLOWED; @@ -177,10 +191,10 @@ cublasHandle_t getCurrentCUDABlasHandle() { return handle; } -#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) cublasLtHandle_t getCurrentCUDABlasLtHandle() { #ifdef USE_ROCM - int device; + c10::DeviceIndex device = 0; AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); // Thread local PoolWindows are lazily-initialized diff --git a/aten/src/ATen/cuda/Exceptions.h b/aten/src/ATen/cuda/Exceptions.h index a15f0d7947ec2..c647bc2531b4b 100644 --- a/aten/src/ATen/cuda/Exceptions.h +++ b/aten/src/ATen/cuda/Exceptions.h @@ -21,6 +21,15 @@ class CuDNNError : public c10::Error { } // namespace c10 +#define AT_CUDNN_FRONTEND_CHECK(EXPR, ...) \ + do { \ + auto error_object = EXPR; \ + if (!error_object.is_good()) { \ + TORCH_CHECK_WITH(CuDNNError, false, \ + "cuDNN Frontend error: ", error_object.get_message()); \ + } \ + } while (0) \ + #define AT_CUDNN_CHECK_WITH_SHAPES(EXPR, ...) AT_CUDNN_CHECK(EXPR, "\n", ##__VA_ARGS__) // See Note [CHECK macro] diff --git a/aten/src/ATen/cuda/cub-RadixSortKeys.cu b/aten/src/ATen/cuda/cub-RadixSortKeys.cu index cf88c8aa0cc89..74e82ae55cdee 100644 --- a/aten/src/ATen/cuda/cub-RadixSortKeys.cu +++ b/aten/src/ATen/cuda/cub-RadixSortKeys.cu @@ -51,5 +51,8 @@ void radix_sort_keys( int64_t end_bit); AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, AT_INSTATIATE_CUB_TEMPLATES) +AT_INSTATIATE_CUB_TEMPLATES(uint16_t, UInt16) +AT_INSTATIATE_CUB_TEMPLATES(uint32_t, UInt32) +AT_INSTATIATE_CUB_TEMPLATES(uint64_t, UInt64) } // namespace at::cuda::cub diff --git a/aten/src/ATen/cuda/cub-RadixSortPairs.cu b/aten/src/ATen/cuda/cub-RadixSortPairs.cu index bd20069cf6ad0..cc7c969300104 100644 --- a/aten/src/ATen/cuda/cub-RadixSortPairs.cu +++ b/aten/src/ATen/cuda/cub-RadixSortPairs.cu @@ -77,6 +77,9 @@ AT_INSTANTIATE_SORT_PAIRS(int64_t, 4) AT_INSTANTIATE_SORT_PAIRS(scalar_t, 8) AT_FORALL_SCALAR_TYPES_AND2(Bool, Half, AT_INSTANTIATE_SORT_PAIRS_8) +AT_INSTANTIATE_SORT_PAIRS(uint16_t, 8) +AT_INSTANTIATE_SORT_PAIRS(uint32_t, 8) +AT_INSTANTIATE_SORT_PAIRS(uint64_t, 8) // BFloat16 Radix sort is supported from ROCm 4.5 onwards #if !AT_ROCM_ENABLED() || (AT_ROCM_ENABLED() && ROCM_VERSION >= 40500) diff --git a/aten/src/ATen/cuda/cub.cuh b/aten/src/ATen/cuda/cub.cuh index 9663f354f764c..062c365a4e1a9 100644 --- a/aten/src/ATen/cuda/cub.cuh +++ b/aten/src/ATen/cuda/cub.cuh @@ -6,8 +6,6 @@ #include #include -#include - #include #if USE_GLOBAL_CUB_WRAPPED_NAMESPACE() diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.cpp b/aten/src/ATen/cuda/detail/CUDAHooks.cpp index 24c29a6381544..d3b80af2e8599 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.cpp +++ b/aten/src/ATen/cuda/detail/CUDAHooks.cpp @@ -137,7 +137,7 @@ bool CUDAHooks::isPinnedPtr(const void* data) const { cudaPointerAttributes attr; // We do not believe that CUDA needs mutable access to the data // here. 
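A condensed, self-contained sketch of the probe isPinnedPtr performs; the change here passes the pointer straight through because cudaPointerGetAttributes accepts a const pointer:

#include <cuda_runtime.h>

bool is_pinned_ptr(const void* data) {
  cudaPointerAttributes attr{};
  cudaError_t err = cudaPointerGetAttributes(&attr, data);
  if (err == cudaErrorInvalidValue) {
    (void)cudaGetLastError();  // clear the sticky error (unregistered host pointers can report this)
    return false;
  }
  return err == cudaSuccess && attr.type == cudaMemoryTypeHost;
}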
- cudaError_t err = cudaPointerGetAttributes(&attr, const_cast(data)); + cudaError_t err = cudaPointerGetAttributes(&attr, data); #if !defined(USE_ROCM) if (err == cudaErrorInvalidValue) { (void)cudaGetLastError(); // clear CUDA error @@ -184,6 +184,16 @@ bool CUDAHooks::hasCuSOLVER() const { #endif } +bool CUDAHooks::hasCuBLASLt() const { +#if defined(CUDART_VERSION) + return true; +#elif AT_ROCM_ENABLED() && defined(ROCM_VERSION) && ROCM_VERSION >= 50700 + return true; +#else + return false; +#endif +} + bool CUDAHooks::hasROCM() const { // Currently, this is same as `compiledWithMIOpen`. // But in future if there are ROCm builds without MIOpen, @@ -227,7 +237,7 @@ const at::cuda::NVRTC& CUDAHooks::nvrtc() const { } DeviceIndex current_device() { - int device; + c10::DeviceIndex device = 0; cudaError_t err = c10::cuda::GetDevice(&device); if (err == cudaSuccess) { return device; diff --git a/aten/src/ATen/cuda/detail/CUDAHooks.h b/aten/src/ATen/cuda/detail/CUDAHooks.h index dddeab1e2675f..2002bd1b77402 100644 --- a/aten/src/ATen/cuda/detail/CUDAHooks.h +++ b/aten/src/ATen/cuda/detail/CUDAHooks.h @@ -27,6 +27,7 @@ struct CUDAHooks : public at::CUDAHooksInterface { bool hasMAGMA() const override; bool hasCuDNN() const override; bool hasCuSOLVER() const override; + bool hasCuBLASLt() const override; bool hasROCM() const override; const at::cuda::NVRTC& nvrtc() const override; DeviceIndex current_device() const override; diff --git a/aten/src/ATen/cuda/detail/IndexUtils.cuh b/aten/src/ATen/cuda/detail/IndexUtils.cuh index 1eceaf690f5a1..db8519389e9ff 100644 --- a/aten/src/ATen/cuda/detail/IndexUtils.cuh +++ b/aten/src/ATen/cuda/detail/IndexUtils.cuh @@ -21,8 +21,16 @@ getTensorInfo(const at::TensorBase &t) { st[i] = t.stride(i); } + scalar* data_ptr = nullptr; + + if constexpr (std::is_const::value) { + data_ptr = t.const_data_ptr(); + } else { + data_ptr = t.mutable_data_ptr(); + } + return TensorInfo( - t.data_ptr(), dims, sz, st); + data_ptr, dims, sz, st); } } // namespace at::cuda::detail diff --git a/aten/src/ATen/cuda/jiterator.cu b/aten/src/ATen/cuda/jiterator.cu index 0a4ac757b1ada..db751e33c43d2 100644 --- a/aten/src/ATen/cuda/jiterator.cu +++ b/aten/src/ATen/cuda/jiterator.cu @@ -339,7 +339,7 @@ c10::SmallVector CompileAndLaunchKernel( config.add_owned_output(outs[i]); } for (const auto& t: tensors) { - config.add_input(t); + config.add_const_input(t); } TensorIterator iter = config.build(); diff --git a/aten/src/ATen/cuda/tunable/GemmCommon.h b/aten/src/ATen/cuda/tunable/GemmCommon.h new file mode 100644 index 0000000000000..a1d7d0dc21638 --- /dev/null +++ b/aten/src/ATen/cuda/tunable/GemmCommon.h @@ -0,0 +1,217 @@ +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. 
+// +#pragma once + +#include + +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif + +namespace at::cuda::tunable { + +enum class BlasOp { + N = 0, + T = 1 +}; + +inline std::string BlasOpToString(BlasOp op) { + switch (op) { + case BlasOp::N: + return "N"; + case BlasOp::T: + return "T"; + } + TORCH_CHECK(false, "unrecognized BlasOp"); + return "N"; +} + +namespace detail { + +static bool NumericalCheck(ScalarType dtype, void* c, void* other_c, int64_t size) { + auto options = at::TensorOptions().dtype(dtype).device(at::kCUDA); + // comparison done as 1D tensor + at::Tensor ref = at::from_blob(c, {size}, options); + at::Tensor oth = at::from_blob(other_c, {size}, options); + at::Tensor ref_float = ref.to(at::kFloat); + at::Tensor oth_float = oth.to(at::kFloat); + std::vector atols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5}; + std::vector rtols{1e-1, 1e-2, 1e-3, 1e-4, 1e-5}; + double last_succeed_atol = 1; + double last_succeed_rtol = 1; + for (auto& atol : atols) { + for (auto& rtol : rtols) { + if (at::allclose(ref_float, oth_float, rtol, atol)) { + last_succeed_atol = atol; + last_succeed_rtol = rtol; + } + } + } + if (last_succeed_atol == 1) { + return false; + } + else { + TUNABLE_LOG("ā”œā”€ā”€verify numerics: atol=", last_succeed_atol, ", rtol=", last_succeed_rtol); + } + + return true; +} + +} + +template +struct GemmParams : OpParams { + std::string Signature() const override { + return c10::str(transa, transb, "_", m, "_", n, "_", k); + } + + GemmParams* DeepCopy() const { + GemmParams* copy = new GemmParams; + *copy = *this; + c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + size_t c_size = m * n * sizeof(T); + copy->c = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(c_size)); + AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync( + copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true)); + return copy; + } + + // only call on object returned by DeepCopy + void Delete() { + c10::cuda::CUDACachingAllocator::raw_delete(c); + } + + TuningStatus NumericalCheck(GemmParams *other) { + auto c_dtype = c10::CppTypeToScalarType::value; + return detail::NumericalCheck(c_dtype, c, other->c, m*n) ? OK : FAIL; + } + + char transa; + char transb; + int64_t m; + int64_t n; + int64_t k; + at::opmath_type alpha; + const T* a; + int64_t lda; + const T* b; + int64_t ldb; + at::opmath_type beta; + T* c; + int64_t ldc; +}; + +template +struct GemmStridedBatchedParams : OpParams { + std::string Signature() const override { + return c10::str(transa, transb, "_", m, "_", n, "_", k, "_B_", batch); + } + + GemmStridedBatchedParams* DeepCopy() const { + GemmStridedBatchedParams* copy = new GemmStridedBatchedParams; + *copy = *this; + c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + size_t c_size = batch * stride_c * sizeof(T); + copy->c = static_cast(c10::cuda::CUDACachingAllocator::raw_alloc(c_size)); + AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync( + copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true)); + return copy; + } + + // only call on object returned by DeepCopy + void Delete() { + c10::cuda::CUDACachingAllocator::raw_delete(c); + } + + TuningStatus NumericalCheck(GemmStridedBatchedParams *other) { + auto c_dtype = c10::CppTypeToScalarType::value; + return detail::NumericalCheck(c_dtype, c, other->c, batch*stride_c) ? 
OK : FAIL; + } + + char transa; + char transb; + int64_t m; + int64_t n; + int64_t k; + at::opmath_type alpha; + const T* a; + int64_t lda; + int64_t stride_a; + const T* b; + int64_t ldb; + int64_t stride_b; + at::opmath_type beta; + T* c; + int64_t ldc; + int64_t stride_c; + int64_t batch; +}; + +template +struct ScaledGemmParams : OpParams { + std::string Signature() const override { + return c10::str(transa, transb, "_", m, "_", n, "_", k); + } + + ScaledGemmParams* DeepCopy() const { + ScaledGemmParams* copy = new ScaledGemmParams; + *copy = *this; + c10::DeviceIndex device = 0; + AT_CUDA_CHECK(c10::cuda::GetDevice(&device)); + size_t c_size = m * n * sizeof(T); + copy->c = c10::cuda::CUDACachingAllocator::raw_alloc(c_size); + AT_CUDA_CHECK(c10::cuda::CUDACachingAllocator::memcpyAsync( + copy->c, device, c, device, c_size, getCurrentCUDAStream(device), true)); + return copy; + } + + // only call on object returned by DeepCopy + void Delete() { + c10::cuda::CUDACachingAllocator::raw_delete(c); + } + + TuningStatus NumericalCheck(ScaledGemmParams *other) { + return detail::NumericalCheck(c_dtype, c, other->c, m*n) ? OK : FAIL; + } + + char transa; + char transb; + int64_t m; + int64_t n; + int64_t k; + const void* a; + const void* a_scale_ptr; + int64_t lda; + ScalarType a_dtype; + const void* b; + const void* b_scale_ptr; + int64_t ldb; + ScalarType b_dtype; + const void* bias_ptr; + ScalarType bias_dtype; + void* c; + const void* c_scale_ptr; + int64_t ldc; + ScalarType c_dtype; + void* amax_ptr; + bool use_fast_accum; +}; + +} // namespace at::cuda::tunable diff --git a/aten/src/ATen/cuda/tunable/GemmHipblaslt.h b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h new file mode 100644 index 0000000000000..da1483aee72c0 --- /dev/null +++ b/aten/src/ATen/cuda/tunable/GemmHipblaslt.h @@ -0,0 +1,591 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. 
+ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include + +#define TORCH_HIPBLASLT_CHECK(EXPR) \ + do { \ + hipblasStatus_t __err = EXPR; \ + TORCH_CHECK(__err == HIPBLAS_STATUS_SUCCESS, \ + "hipblaslt error: ", \ + hipblasStatusToString(__err), \ + " when calling `" #EXPR "`"); \ + } while (0) + +namespace at::cuda::tunable { + +#ifdef HIPBLASLT_HAS_GETINDEXFROMALGO +#define GETINDEXFROMALGO(algo) hipblaslt_ext::getIndexFromAlgo(algo) +#else +static int getIndexFromAlgo(hipblasLtMatmulAlgo_t& algo) { + int* algo_ptr = (int*)algo.data; + if(*algo_ptr < 0) { + return -1; + } + return *algo_ptr; +} +#define GETINDEXFROMALGO(algo) getIndexFromAlgo(algo) +#endif + +#ifdef HIPBLASLT_CUSTOM_COMPUTE_TYPE +#define COMPUTE_TYPE_32 HIPBLASLT_COMPUTE_F32 +#else +#define COMPUTE_TYPE_32 HIPBLAS_COMPUTE_32F +#endif + +#ifdef HIPBLASLT_CUSTOM_DATA_TYPE + +template +constexpr hipblasltDatatype_t HipBlasDataTypeFor(); + +template <> +constexpr hipblasltDatatype_t HipBlasDataTypeFor() { + return HIPBLASLT_R_32F; +} + +template <> +constexpr hipblasltDatatype_t HipBlasDataTypeFor() { + return HIPBLASLT_R_16F; +} + +template <> +constexpr hipblasltDatatype_t HipBlasDataTypeFor() { + return HIPBLASLT_R_16B; +} + +template <> +constexpr hipblasltDatatype_t HipBlasDataTypeFor() { + return HIPBLASLT_R_64F; +} + +template <> +constexpr hipblasltDatatype_t HipBlasDataTypeFor() { + return HIPBLASLT_R_8F_E4M3; +} + +template <> +constexpr hipblasltDatatype_t HipBlasDataTypeFor() { + return HIPBLASLT_R_8F_E5M3; +} + +#define DATA_TYPE_R_32 HIPBLASLT_R_32F + +#else + +template +constexpr hipblasDatatype_t HipBlasDataTypeFor(); + +template <> +constexpr hipblasDatatype_t HipBlasDataTypeFor() { + return HIPBLAS_R_32F; +} + +template <> +constexpr hipblasDatatype_t HipBlasDataTypeFor() { + return HIPBLAS_R_16F; +} + +template <> +constexpr hipblasDatatype_t HipBlasDataTypeFor() { + return HIPBLAS_R_16B; +} + +template <> +constexpr hipblasDatatype_t HipBlasDataTypeFor() { + return HIPBLAS_R_64F; +} + +template <> +constexpr hipblasDatatype_t HipBlasDataTypeFor() { + return HIP_R_8F_E4M3_FNUZ; +} + +template <> +constexpr hipblasDatatype_t HipBlasDataTypeFor() { + return HIP_R_8F_E5M2_FNUZ; +} + +#ifdef HIPBLAS_V2 +#define DATA_TYPE_R_32 HIP_R_32F +#else +#define DATA_TYPE_R_32 HIPBLAS_R_32F +#endif + +#endif + +template +int GetBatchFromParams(const GemmParams* params) { + return 1; +} + +template +int GetBatchFromParams(const GemmStridedBatchedParams* params) { + return params->batch; +} + +template +int GetBatchFromParams(const ScaledGemmParams* params) { + return 1; +} + +template +int GetStrideAFromParams(const GemmParams* params) { + return 1; +} + +template +int GetStrideAFromParams(const GemmStridedBatchedParams* params) { + return params->stride_a; +} + +template +int GetStrideAFromParams(const ScaledGemmParams* params) { + return 1; +} + +template +int GetStrideBFromParams(const GemmParams* params) { + return 1; +} + +template +int GetStrideBFromParams(const GemmStridedBatchedParams* params) { + return params->stride_b; +} + +template +int GetStrideBFromParams(const ScaledGemmParams* params) { + return 1; +} + +template +int GetStrideCFromParams(const GemmParams* params) { + return 1; +} + +template +int GetStrideCFromParams(const GemmStridedBatchedParams* params) { + return params->stride_c; +} + +template +int GetStrideCFromParams(const ScaledGemmParams* params) { + return 1; +} + +template +float GetAlphaFromParams(const GemmParams* params) { + return params->alpha; 
+} + +template +float GetAlphaFromParams(const GemmStridedBatchedParams* params) { + return params->alpha; +} + +template +float GetAlphaFromParams(const ScaledGemmParams* params) { + return 1.0; +} + +template +float GetBetaFromParams(const GemmParams* params) { + return params->beta; +} + +template +float GetBetaFromParams(const GemmStridedBatchedParams* params) { + return params->beta; +} + +template +float GetBetaFromParams(const ScaledGemmParams* params) { + return 0.0; +} + +template +const void* GetAScalePointerFromParams(const GemmParams* params) { + return nullptr; +} + +template +const void* GetAScalePointerFromParams(const GemmStridedBatchedParams* params) { + return nullptr; +} + +template +const void* GetAScalePointerFromParams(const ScaledGemmParams* params) { + return params->a_scale_ptr; +} + +template +const void* GetBScalePointerFromParams(const GemmParams* params) { + return nullptr; +} + +template +const void* GetBScalePointerFromParams(const GemmStridedBatchedParams* params) { + return nullptr; +} + +template +const void* GetBScalePointerFromParams(const ScaledGemmParams* params) { + return params->b_scale_ptr; +} + +template +const void* GetDScalePointerFromParams(const GemmParams* params) { + return nullptr; +} + +template +const void* GetDScalePointerFromParams(const GemmStridedBatchedParams* params) { + return nullptr; +} + +template +const void* GetDScalePointerFromParams(const ScaledGemmParams* params) { + return params->c_scale_ptr; +} + +template +const void* GetBiasPointerFromParams(const GemmParams* params) { + return nullptr; +} + +template +const void* GetBiasPointerFromParams(const GemmStridedBatchedParams* params) { + return nullptr; +} + +template +const void* GetBiasPointerFromParams(const ScaledGemmParams* params) { + return params->bias_ptr; +} + +template +hipDataType GetBiasTypeFromParams(const GemmParams* params) { + return HIP_R_32F; +} + +template +hipDataType GetBiasTypeFromParams(const GemmStridedBatchedParams* params) { + return HIP_R_32F; +} + +template +hipDataType GetBiasTypeFromParams(const ScaledGemmParams* params) { + return at::cuda::ScalarTypeToCudaDataType(params->bias_dtype); +} + +static hipblasOperation_t _hipblasOpFromChar(char op) { + switch (op) { + case 'n': + case 'N': + return HIPBLAS_OP_N; + case 't': + case 'T': + return HIPBLAS_OP_T; + case 'c': + case 'C': + return HIPBLAS_OP_C; + } + AT_ERROR( + "_hipblasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`"); +} + +static char _charFromhipblasOp(hipblasOperation_t op) { + switch (op) { + case HIPBLAS_OP_N: + return 'N'; + case HIPBLAS_OP_T: + return 'T'; + case HIPBLAS_OP_C: + return 'C'; + } + AT_ERROR( + "_charFromhipblasOp input should be HIPBLAS_OP_N/T/C but got `", op, "`"); +} + +static hipblasOperation_t MapLayoutToHipBlasLt(BlasOp layout) { + if (layout == BlasOp::N) { + return HIPBLAS_OP_N; + } + return HIPBLAS_OP_T; +} + +static size_t GetHipblasltWorkspaceSize() { + static const char * env = getenv("HIPBLASLT_WORKSPACE_SIZE"); + // 256MB is max workspace size allowed for hipblaslt + // hipblaslt-bench uses 32MB + // recommendation from hipblaslt author was 76MB + size_t workspace_size = 2*128*1024*1024; // default 256MB + if (env) { + try { + workspace_size = std::stoi(env); + } catch(std::invalid_argument const& e) { + TORCH_WARN("invalid HIPBLASLT_WORKSPACE_SIZE,", + " using default workspace size of ", workspace_size, " bytes."); + } catch(std::out_of_range const& e) { + TORCH_WARN("HIPBLASLT_WORKSPACE_SIZE out of range,", + " using default 
workspace size of ", workspace_size, " bytes."); + } + } + return workspace_size; +} + +template +struct HipBlasLtDeleter { + void operator()(T* x) { + if (x != nullptr) { + TORCH_CUDABLAS_CHECK(destructor(x)); + } + } +}; + +template +class HipBlasLtDescriptor { + public: + T* descriptor() const { + return descriptor_.get(); + } + T* descriptor() { + return descriptor_.get(); + } + + protected: + std::unique_ptr> descriptor_; +}; + +class HipBlasLtMatmulDescriptor : public HipBlasLtDescriptor< + hipblasLtMatmulDescOpaque_t, + &hipblasLtMatmulDescDestroy> { + public: + HipBlasLtMatmulDescriptor( + hipblasComputeType_t compute_type, + hipDataType scale_type) { + hipblasLtMatmulDesc_t raw_descriptor = nullptr; + TORCH_HIPBLASLT_CHECK( + hipblasLtMatmulDescCreate(&raw_descriptor, compute_type, scale_type)); + descriptor_.reset(raw_descriptor); + } + template + inline void setAttribute(hipblasLtMatmulDescAttributes_t attr, const T value) { + TORCH_HIPBLASLT_CHECK(::hipblasLtMatmulDescSetAttribute(descriptor(), attr, &value, sizeof(T))); + } +}; + +template +class HipblasltGemmOp : public Callable { + public: + HipblasltGemmOp(hipblasLtMatmulAlgo_t algo) : algo_{algo} {} + + TuningStatus Call(const ParamsT* params) override { + hipblasOperation_t transa_outer = MapLayoutToHipBlasLt(ALayout); + hipblasOperation_t transb_outer = MapLayoutToHipBlasLt(BLayout); + auto a_datatype = HipBlasDataTypeFor(); + auto b_datatype = HipBlasDataTypeFor(); + auto in_out_datatype = HipBlasDataTypeFor(); + auto opa = _hipblasOpFromChar(params->transa); + auto opb = _hipblasOpFromChar(params->transb); + + TORCH_CHECK(transa_outer == opa && transb_outer == opb, "trans mismatch, shouldn't happen"); + + float alpha = GetAlphaFromParams(params); + float beta = GetBetaFromParams(params); + + hipblasLtMatrixLayout_t mat_a, mat_b, mat_c; + if (opa == HIPBLAS_OP_N) { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_a, a_datatype, params->m, params->k, params->lda)); + } + else { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_a, a_datatype, params->k, params->m, params->lda)); + } + if (opb == HIPBLAS_OP_N) { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_b, b_datatype, params->k, params->n, params->ldb)); + } + else { + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_b, b_datatype, params->n, params->k, params->ldb)); + } + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutCreate(&mat_c, in_out_datatype, params->m, params->n, params->ldc)); + + // specific to batched gemmm + int batch = GetBatchFromParams(params); + if (batch > 1) { + int64_t stride_a = GetStrideAFromParams(params); + int64_t stride_b = GetStrideBFromParams(params); + int64_t stride_c = GetStrideCFromParams(params); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_a, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_a, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_a, sizeof(stride_a))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_b, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_b, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_b, sizeof(stride_b))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_c, HIPBLASLT_MATRIX_LAYOUT_BATCH_COUNT, &batch, sizeof(batch))); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutSetAttribute( + mat_c, HIPBLASLT_MATRIX_LAYOUT_STRIDED_BATCH_OFFSET, &stride_c, sizeof(stride_c))); + 
} + + HipBlasLtMatmulDescriptor matmul(COMPUTE_TYPE_32, DATA_TYPE_R_32); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSA, opa); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_TRANSB, opb); + + // specific to scaled gemm + const void* mat1_scale_ptr = GetAScalePointerFromParams(params); + const void* mat2_scale_ptr = GetBScalePointerFromParams(params); + const void* result_scale_ptr = GetDScalePointerFromParams(params); + if (mat1_scale_ptr && mat2_scale_ptr && result_scale_ptr) { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_A_SCALE_POINTER, mat1_scale_ptr); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_B_SCALE_POINTER, mat2_scale_ptr); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_D_SCALE_POINTER, result_scale_ptr); + + const void* bias_ptr = GetBiasPointerFromParams(params); + auto bias_datatype = GetBiasTypeFromParams(params); + if (bias_ptr) { + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_BIAS_POINTER, bias_ptr); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_EPILOGUE, HIPBLASLT_EPILOGUE_BIAS); + matmul.setAttribute(HIPBLASLT_MATMUL_DESC_BIAS_DATA_TYPE, bias_datatype); + } + } + + size_t workspace_size = GetHipblasltWorkspaceSize(); + + auto op_handle = at::cuda::getCurrentCUDABlasLtHandle(); + + size_t ret_workspace_size = 0; + auto status = hipblaslt_ext::matmulIsAlgoSupported(op_handle, + matmul.descriptor(), + &alpha, + mat_a, + mat_b, + &beta, + mat_c, + mat_c, + algo_, + ret_workspace_size); + + if (status == HIPBLAS_STATUS_SUCCESS) { + if (ret_workspace_size >= workspace_size) { + //TUNABLE_LOG("[hipBLASLt] Solution #", algo_index, " workspace too large"); + return FAIL; + } + } + else { + //TUNABLE_LOG("[hipBLASLt] Solution #", algo_index, " not supported"); + return FAIL; + } + + void* workspace_buffer = nullptr; + if (workspace_size > 0) { + workspace_buffer = c10::cuda::CUDACachingAllocator::raw_alloc(workspace_size); + } + + TORCH_HIPBLASLT_CHECK(hipblasLtMatmul(op_handle, + matmul.descriptor(), + &alpha, + params->a, + mat_a, + params->b, + mat_b, + &beta, + params->c, + mat_c, + params->c, + mat_c, + &algo_, + workspace_buffer, + workspace_size, + at::cuda::getCurrentCUDAStream())); + + //TORCH_HIPBLASLT_CHECK(hipblasLtMatmulDescDestroy(matmul)); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_a)); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_b)); + TORCH_HIPBLASLT_CHECK(hipblasLtMatrixLayoutDestroy(mat_c)); + if (workspace_size > 0) { + c10::cuda::CUDACachingAllocator::raw_delete(workspace_buffer); + } + return OK; + } + + private: + hipblasLtMatmulAlgo_t algo_; +}; + +template +auto GetHipBlasLtTypeStringAndOps() { + hipblasOperation_t transa_outer = MapLayoutToHipBlasLt(ALayout); + hipblasOperation_t transb_outer = MapLayoutToHipBlasLt(BLayout); + auto a_datatype = HipBlasDataTypeFor(); + auto b_datatype = HipBlasDataTypeFor(); + auto in_out_datatype = HipBlasDataTypeFor(); + std::vector heuristic_result; + + hipblasLtHandle_t handle; + TORCH_HIPBLASLT_CHECK(hipblasLtCreate(&handle)); + TORCH_HIPBLASLT_CHECK(hipblaslt_ext::getAllAlgos(handle, + hipblaslt_ext::GemmType::HIPBLASLT_GEMM, + transa_outer, + transb_outer, + a_datatype, + b_datatype, + in_out_datatype, + in_out_datatype, + COMPUTE_TYPE_32, + heuristic_result)); + TORCH_HIPBLASLT_CHECK(hipblasLtDestroy(handle)); + + // Sort heuristic_result by algo index to make sure the order of returned algos is deterministic. 
+ std::sort(heuristic_result.begin(), + heuristic_result.end(), + [](hipblasLtMatmulHeuristicResult_t& a, hipblasLtMatmulHeuristicResult_t& b) { + return GETINDEXFROMALGO(a.algo) < GETINDEXFROMALGO(b.algo); + }); + + int returned_algo_count = heuristic_result.size(); + std::vector>>> ret; + for (int i = 0; i < returned_algo_count; i++) { + auto algo = heuristic_result[i].algo; + int algo_index = GETINDEXFROMALGO(algo); + auto callable = std::make_unique>(algo); + std::string type_string = c10::str( + "Gemm_Hipblaslt_", _charFromhipblasOp(transa_outer), _charFromhipblasOp(transb_outer), "_", algo_index); + ret.emplace_back(type_string, std::move(callable)); + } + + return ret; +} + +template +auto GetHipBlasLtGemmTypeStringAndOps() { + return GetHipBlasLtTypeStringAndOps>(); +} + +template +auto GetHipBlasLtGemmStridedBatchedTypeStringAndOps() { + return GetHipBlasLtTypeStringAndOps>(); +} + +template +auto GetHipBlasLtScaledGemmTypeStringAndOps() { + return GetHipBlasLtTypeStringAndOps>(); +} + +#undef TORCH_HIPBLASLT_CHECK +#undef GETINDEXFROMALGO +#undef COMPUTE_TYPE_32 +#undef DATA_TYPE_R_32 + +} // namespace at::cuda::tunable diff --git a/aten/src/ATen/cuda/tunable/GemmRocblas.h b/aten/src/ATen/cuda/tunable/GemmRocblas.h new file mode 100644 index 0000000000000..f096ff00fd9b4 --- /dev/null +++ b/aten/src/ATen/cuda/tunable/GemmRocblas.h @@ -0,0 +1,275 @@ +// Copyright (c) Microsoft Corporation. All rights reserved. +// Licensed under the MIT License. + +#pragma once + +#include +#include +#include +#include + +#define ROCBLAS_BETA_FEATURES_API +#include + +#define TORCH_ROCBLAS_CHECK(EXPR) \ + do { \ + rocblas_status __err = EXPR; \ + TORCH_CHECK(__err == rocblas_status_success, \ + "rocblas error: ", \ + rocblas_status_to_string(__err), \ + " when calling `" #EXPR "`"); \ + } while (0) + +namespace at::cuda::tunable { + +template +constexpr rocblas_datatype RocBlasDataTypeFor(); + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor() { + return rocblas_datatype_f32_r; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor() { + return rocblas_datatype_f64_r; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor() { + return rocblas_datatype_f16_r; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor() { + return rocblas_datatype_bf16_r; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor>() { + return rocblas_datatype_f32_c; +} + +template <> +constexpr rocblas_datatype RocBlasDataTypeFor>() { + return rocblas_datatype_f64_c; +} + +template +constexpr rocblas_datatype RocBlasComputeTypeFor(); + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor() { + return rocblas_datatype_f32_r; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor() { + return rocblas_datatype_f64_r; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor() { + // Note that we're returning the _compute_ type for a given datatype. + // As of 12/2022, using compute type FP16 for 16-bit floats was much + // slower than using compute type FP32. So we use FP32 compute even for + // FP16 datatypes. This is how GEMM is implemented even in the function + // rocblasGemmHelper (see fpgeneric.h) + return rocblas_datatype_f32_r; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor() { + // Note that we're returning the _compute_ type for a given datatype. + // As of 12/2022, using compute type FP16 for 16-bit floats was much + // slower than using compute type FP32. So we use FP32 compute even for + // BF16 datatypes. 
This is how GEMM is implemented even in the function + // rocblasGemmHelper (see fpgeneric.h) + return rocblas_datatype_f32_r; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor>() { + return rocblas_datatype_f32_c; +} + +template <> +constexpr rocblas_datatype RocBlasComputeTypeFor>() { + return rocblas_datatype_f64_c; +} + +template +auto DoCastForHalfOrBfloat16(const T fp) { + return fp; +} + +template <> +inline auto DoCastForHalfOrBfloat16(const Half fp) { + // alpha and beta should be the same as compute_type, in Half case it is float. + float h = fp; + return h; +} + +template <> +inline auto DoCastForHalfOrBfloat16(const BFloat16 fp) { + // alpha and beta should be the same as compute_type, in bfloat16 case it is float. + float h = fp; + return h; +} + +static rocblas_operation _rocblasOpFromChar(char op) { + switch (op) { + case 'n': + case 'N': + return rocblas_operation_none; + case 't': + case 'T': + return rocblas_operation_transpose; + case 'c': + case 'C': + return rocblas_operation_conjugate_transpose; + } + AT_ERROR( + "_rocblasOpFromChar input should be 't', 'n' or 'c' but got `", op, "`"); +} + +template +class RocblasGemmOp : public Callable> { + public: + RocblasGemmOp(int solution) : solution_{solution} {} + + TuningStatus Call(const GemmParams* params) override { + auto input_output_type = RocBlasDataTypeFor(); + auto compute_type = RocBlasComputeTypeFor(); + auto h_a = DoCastForHalfOrBfloat16(params->alpha); + auto h_b = DoCastForHalfOrBfloat16(params->beta); + auto status = rocblas_gemm_ex( + (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(), + _rocblasOpFromChar(params->transa), + _rocblasOpFromChar(params->transb), + params->m, params->n, params->k, + &h_a, + params->a, input_output_type, params->lda, + params->b, input_output_type, params->ldb, + &h_b, + params->c, input_output_type, params->ldc, + params->c, input_output_type, params->ldc, + compute_type, + rocblas_gemm_algo_solution_index, + solution_, + rocblas_gemm_flags_none); + if (status != rocblas_status_success) { + return FAIL; + } + return OK; + } + + private: + int solution_; +}; + +template +auto GetRocBlasGemmTypeStringAndOps() { + rocblas_handle handle = (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(); + int solution_size; + auto input_output_type = RocBlasDataTypeFor(); + auto compute_type = RocBlasComputeTypeFor(); + // Get the number of available solutions + TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle, + input_output_type, + input_output_type, + compute_type, + rocblas_gemm_flags_none, + nullptr, + &solution_size)); + std::vector solutions(solution_size); + // Get the list of available solutions + TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle, + input_output_type, + input_output_type, + compute_type, + rocblas_gemm_flags_none, + solutions.data(), + &solution_size)); + // Sort the solutions in ascending order to make the solution vector deterministic across runs + std::sort(solutions.begin(), solutions.end()); + + std::vector>>>> ret; + for (size_t i = 0; i < solutions.size(); ++i) { + auto callable = std::make_unique>(solutions[i]); + ret.emplace_back(std::make_pair(c10::str("Gemm_Rocblas_", solutions[i]), std::move(callable))); + } + return ret; +} + +template +class RocblasGemmStridedBatchedOp : public Callable> { + public: + RocblasGemmStridedBatchedOp(int solution) : solution_{solution} {} + + TuningStatus Call(const GemmStridedBatchedParams* params) override { + auto input_output_type = RocBlasDataTypeFor(); + auto 
compute_type = RocBlasComputeTypeFor(); + auto h_a = DoCastForHalfOrBfloat16(params->alpha); + auto h_b = DoCastForHalfOrBfloat16(params->beta); + auto status = rocblas_gemm_strided_batched_ex( + (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(), + _rocblasOpFromChar(params->transa), + _rocblasOpFromChar(params->transb), + params->m, params->n, params->k, + &h_a, + params->a, input_output_type, params->lda, params->stride_a, + params->b, input_output_type, params->ldb, params->stride_b, + &h_b, + params->c, input_output_type, params->ldc, params->stride_c, + params->c, input_output_type, params->ldc, params->stride_c, + params->batch, + compute_type, + rocblas_gemm_algo_solution_index, + solution_, + rocblas_gemm_flags_none); + if (status != rocblas_status_success) { + return FAIL; + } + return OK; + } + + private: + int solution_; +}; + +template +auto GetRocBlasGemmStridedBatchedTypeStringAndOps() { + rocblas_handle handle = (rocblas_handle)at::cuda::getCurrentCUDABlasHandle(); + int solution_size; + auto input_output_type = RocBlasDataTypeFor(); + auto compute_type = RocBlasComputeTypeFor(); + // Get the number of available solutions + TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle, + input_output_type, + input_output_type, + compute_type, + rocblas_gemm_flags_none, + nullptr, + &solution_size)); + std::vector solutions(solution_size); + // Get the list of available solutions + TORCH_ROCBLAS_CHECK(rocblas_gemm_ex_get_solutions_by_type(handle, + input_output_type, + input_output_type, + compute_type, + rocblas_gemm_flags_none, + solutions.data(), + &solution_size)); + // Sort the solutions in ascending order to make the solution vector deterministic across runs + std::sort(solutions.begin(), solutions.end()); + + std::vector>>>> ret; + for (size_t i = 0; i < solutions.size(); ++i) { + auto callable = std::make_unique>(solutions[i]); + ret.emplace_back(std::make_pair(c10::str("Gemm_Rocblas_", solutions[i]), std::move(callable))); + } + return ret; +} + +} // namespace at::cuda::tunable diff --git a/aten/src/ATen/cuda/tunable/README.md b/aten/src/ATen/cuda/tunable/README.md new file mode 100644 index 0000000000000..364e6975c6c64 --- /dev/null +++ b/aten/src/ATen/cuda/tunable/README.md @@ -0,0 +1,88 @@ +# TunableOp + +This directory implements a TunableOp interface. + +Some operations, such as GEMMs, could be implemented using more than one library or more than one technique. For +example, a GEMM could be implemented for CUDA or ROCm using either the blas or blasLt libraries. Further, ROCm's +rocblas and hipblaslt libraries allow the user to query for all possible algorithms and then choose one. How does one +know which implementation is the fastest and should be chosen? That's what TunableOp provides. + +The behavior of TunableOp is currently easily manipulated through environment variables, though you could use the C++ +interface of at::cuda::tunable::getTuningContext(). A Python interface to the TuningContext does not yet exist. + +Currently only a TunableGemm for ROCm is implemented. Any call to at::cuda::blas::gemm() can optionally use the +TunableGemm. Calling gemm() for a given set of input arguments (transa, transb, m, n, k) will attempt to use the +fastest available implementation. + +## Environment Variables + +#### PYTORCH_TUNABLEOP_ENABLED +Default is 0. Set to 1 to enable. +This is the big on/off switch for all TunableOp implementations. + +#### PYTORCH_TUNABLEOP_TUNING +Default is 1. Set to 0 to disable. 
+When enabled, if a tuned entry isn't found, the tuning step runs and the entry is recorded.
+
+#### PYTORCH_TUNABLEOP_VERBOSE
+Default is 0. Set to 1 to enable.
+This will produce a lot of diagnostic messages but may be useful to see if TunableOp is being used at all.
+Otherwise, TunableOp is completely silent unless there is a warning or error during its use.
+
+#### PYTORCH_TUNABLEOP_FILENAME
+Default is 'tunableop_results.csv'. If you provide a filename, the TuningContext will attempt to read it the first time
+the context is used. If tuning is enabled and new tunings are discovered, it will also write out to this same filename
+with all tunings, both the ones it read in at startup as well as the new ones found at runtime. This can be used, for
+example, to build up a tunings file across many workloads by reusing the same file. Unsetting this variable is not
+recommended but can be done, in which case the tuning results will not be saved.
+
+#### PYTORCH_TUNABLEOP_NUMERICAL_CHECK
+Default is 1. Set to 0 to disable. Compare the results of each possible solution against the default solution and reject
+those with low accuracy.
+
+#### PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED
+Default is 1. Set to 0 to prevent hipBLASLt from being considered during tuning.
+
+### Tuning Iterations
+By default, each possible solution for a given operator will be run for either 100 iterations or as many iterations as
+can be run within 30ms, whichever is smaller. Its average execution time will be calculated. The fastest solution is
+chosen. In addition, a set of warm-up iterations can optionally be run prior to the timed iterations. The following
+environment variables can be used to set either the maximum number of iterations to attempt or the maximum amount of
+time allowed in milliseconds, or both, in which case the smaller of the two values is used.
+
+#### PYTORCH_TUNABLEOP_MAX_TUNING_DURATION_MS
+Default is 30.
+
+#### PYTORCH_TUNABLEOP_MAX_TUNING_ITERATIONS
+Default is 100.
+
+#### PYTORCH_TUNABLEOP_MAX_WARMUP_DURATION_MS
+Default is 0, meaning it is not used.
+
+#### PYTORCH_TUNABLEOP_MAX_WARMUP_ITERATIONS
+Default is 1.
+
+## File Output
+
+Assuming you specified a filename, you'll end up with a CSV file with contents like so:
+
+```
+Validator,PT_VERSION,2.2.0
+Validator,ROCM_VERSION,6.0.0.0-12969-1544e39
+Validator,HIPBLASLT_VERSION,0.6.0-a9c5cc7
+Validator,ROCBLAS_VERSION,4.0.0-72e57364-dirty
+GemmTunableOp_float_NT,nt_25088_4096_64,1219,1.262
+GemmTunableOp_float_NT,nt_4096_4096_64,1216,0.033
+```
+
+Note the "Validator" lines. If you change a library version, ROCm version, or PyTorch version, TunableOp will detect
+this and will not load the tunings because they are likely affected by other software changes.
+
+The remaining lines are the tuned solutions for each TunableOp encountered during your execution. Each line consists of
+4 comma-separated fields: operator name, operator parameters, solution name, and average execution time. The execution
+time is an optional field. The CSV file can be edited, but with caution. For example, the solution name (field 3) can be
+changed to "Default" and it will fall back to the original PyTorch untuned implementation. Or, in the case of ROCm's
+hipBLAS or hipBLASLt libraries, if you know the specific solution index you can override the solution that TunableOp
+selected by replacing the value. The operator name and parameters (fields 1 and 2) are internally named and should not
+be modified.
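+
+Putting the environment variables and the file output together, the following is a minimal usage sketch. It assumes a
+ROCm build of PyTorch with TunableOp available; the matrix shapes and the output filename are illustrative only. The
+description of the CSV fields continues after the sketch.
+
+```python
+import os
+
+# Set these before the first tunable GEMM is dispatched; they are read once.
+os.environ["PYTORCH_TUNABLEOP_ENABLED"] = "1"             # master on/off switch
+os.environ["PYTORCH_TUNABLEOP_TUNING"] = "1"              # tune entries that are not yet recorded
+os.environ["PYTORCH_TUNABLEOP_FILENAME"] = "my_tunings.csv"  # illustrative filename
+
+import torch
+
+a = torch.randn(1024, 512, device="cuda", dtype=torch.float16)
+b = torch.randn(512, 256, device="cuda", dtype=torch.float16)
+c = a @ b  # routes through at::cuda::blas::gemm(), which may use the tunable path
+torch.cuda.synchronize()
+# my_tunings.csv is written when the TuningContext is destroyed at process exit.
+```
+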
In the case of GemmTunableOp, field 1 indicates the datatype and whether the inputs are transposed (T) or +not (N) and field 2 indicates the M, N, K input shapes. diff --git a/aten/src/ATen/cuda/tunable/StreamTimer.cpp b/aten/src/ATen/cuda/tunable/StreamTimer.cpp new file mode 100644 index 0000000000000..1407c32dbb352 --- /dev/null +++ b/aten/src/ATen/cuda/tunable/StreamTimer.cpp @@ -0,0 +1,43 @@ +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. +// +#include + +#include +#include +#include + +namespace at::cuda::tunable { + +StreamTimer::StreamTimer() { + AT_CUDA_CHECK(cudaEventCreate(&start_)); + AT_CUDA_CHECK(cudaEventCreate(&end_)); +} + +StreamTimer::~StreamTimer() { +} + +void StreamTimer::Start() { + AT_CUDA_CHECK(cudaDeviceSynchronize()); + AT_CUDA_CHECK(cudaEventRecord(start_, at::cuda::getCurrentCUDAStream())); +} + +void StreamTimer::End() { + AT_CUDA_CHECK(cudaEventRecord(end_, at::cuda::getCurrentCUDAStream())); + AT_CUDA_CHECK(cudaEventSynchronize(end_)); +} + +float StreamTimer::Duration() { + float time; + // time is in ms with a resolution of 1 us + AT_CUDA_CHECK(cudaEventElapsedTime(&time, start_, end_)); + return time; +} + +} // namespace at::cuda::tunable diff --git a/aten/src/ATen/cuda/tunable/StreamTimer.h b/aten/src/ATen/cuda/tunable/StreamTimer.h new file mode 100644 index 0000000000000..69889cbbcbfc6 --- /dev/null +++ b/aten/src/ATen/cuda/tunable/StreamTimer.h @@ -0,0 +1,34 @@ +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. +// +#pragma once + +#include + +#include + +namespace at::cuda::tunable { + +class StreamTimer : public ITimer { + public: + StreamTimer(); + virtual ~StreamTimer(); + + void Start() override; + + void End() override; + + float Duration() override; + + private: + cudaEvent_t start_; + cudaEvent_t end_; +}; + +} // namespace at::cuda::tunable diff --git a/aten/src/ATen/cuda/tunable/Tunable.cpp b/aten/src/ATen/cuda/tunable/Tunable.cpp new file mode 100644 index 0000000000000..22bde7f4c4270 --- /dev/null +++ b/aten/src/ATen/cuda/tunable/Tunable.cpp @@ -0,0 +1,564 @@ +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. 
+// +#include + +#include +#include +#include +#include +#include + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::cuda::tunable { + +namespace { + +TuningContext tuning_context; + +} // anonymous namespace + +TuningContext* getTuningContext() { + return &tuning_context; +} + +std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry) { + return stream << entry.key_ << "," << entry.time_; +} + +// TuningResultsManager + +KernelMap TuningResultsManager::Lookup(const std::string& op_signature) { + std::scoped_lock l{lock_}; + auto it = results_.find(op_signature); + if (it == results_.cend()) { + return {}; + } + return it->second; // copied +} + +ResultEntry TuningResultsManager::Lookup(const std::string& op_signature, const std::string& params_signature) { + std::scoped_lock l{lock_}; + auto kernel_map_it = results_.find(op_signature); + if (kernel_map_it == results_.cend()) { + TUNABLE_LOG("missing op_signature, returning null ResultEntry"); + return ResultEntry::Null(); + } + + const auto& km = kernel_map_it->second; + auto it = km.find(params_signature); + if (it == km.cend()) { + TUNABLE_LOG("missing params_signature, returning null ResultEntry"); + return ResultEntry::Null(); + } + return it->second; +} + +inline void TuningResultsManager::AddImpl(const std::string& op_signature, + const std::string& params_signature, + ResultEntry best, + KernelMap& kernel_map) { + auto it = kernel_map.find(params_signature); + if (it != kernel_map.end()) { + if (it->second != best) { + TUNABLE_LOG(op_signature, "(", params_signature, ") already has a best kernel ", + "id=", it->second, " selected, want to add a different best kernel ", best, + ", the new kernel id will be ignored."); + } + return; + } + + TUNABLE_LOG(op_signature, "(", params_signature, ") -> ", best); + kernel_map.emplace(params_signature, best); +} + +void TuningResultsManager::Add(const std::string& op_signature, const std::string& params_signature, ResultEntry best) { + std::scoped_lock l{lock_}; + + auto it = results_.find(op_signature); + if (it == results_.end()) { + it = results_.insert({op_signature, {}}).first; + } + + AddImpl(op_signature, params_signature, best, it->second); +} + +void TuningResultsManager::Delete(const std::string& op_signature, const std::string& params_signature) { + std::scoped_lock l{lock_}; + + auto it = results_.find(op_signature); + if (it == results_.end()) { + return; + } + + auto it2 = it->second.find(params_signature); + if (it2 == it->second.end()) { + return; + } + + TUNABLE_LOG(op_signature, "(", params_signature, ")"); + it->second.erase(it2); +} + +inline void TuningResultsManager::DisjointMergeImpl( + const std::string& op_signature, + const KernelMap& kernel_map, + /*out*/ std::unordered_map& results) { + auto it = results.find(op_signature); + if (it == results.end()) { + for (const auto& [param_sig, kernel_id] : kernel_map) { + TUNABLE_LOG(op_signature, "(", param_sig, ") -> ", kernel_id); + } + results[op_signature] = kernel_map; + return; + } + + for (const auto& [params_signature, best] : kernel_map) { + AddImpl(op_signature, params_signature, best, it->second); + } +} + +void TuningResultsManager::Load(const std::unordered_map& results_to_load) { + TUNABLE_LOG("Loading results"); + std::scoped_lock l{lock_}; + for (const auto& [op_signature, kernel_map] : results_to_load) { + DisjointMergeImpl(op_signature, kernel_map, 
results_); + } +} + +ResultsMap TuningResultsManager::Dump() { + std::scoped_lock l{lock_}; + return results_; +} + +void TuningResultsManager::DisjointMerge(const std::string& op_signature, const KernelMap& kernel_map) { + std::scoped_lock l{lock_}; + DisjointMergeImpl(op_signature, kernel_map, results_); +} + +size_t TuningResultsManager::GetSize() { + size_t size = 0; + std::scoped_lock l{lock_}; + for (const auto& [op_signature, kernel_map] : results_) { + size += kernel_map.size(); + } + return size; +} + +// TuningResultsValidator + +TuningResultsValidator::TuningResultsValidator() { + RegisterValidator( + "PT_VERSION", + [this]() { return GetPyTorchVersion(); }, + [this](auto&& k) { return ValidatePyTorchVersion(std::forward(k)); }); +} + +std::unordered_map TuningResultsValidator::GetAllValidators() const { + std::unordered_map ret; + for (const auto& [key, get_validate_func_pair] : validators_) { + const GetFunc& getter = get_validate_func_pair.first; + ret[key] = getter(); + } + return ret; +} + +static bool CheckMandatoryKeys( + const TuningResultsValidator::GetValidateFuncs& gv_funcs, + const std::unordered_map& to_check) { + bool passed = true; + for (const auto& k : TuningResultsValidator::mandatory_keys) { + if (gv_funcs.find(k) == gv_funcs.end()) { + passed = false; + TUNABLE_LOG("key=\"", k, "\" is not registered for Get and Validate. "); + } + + if (to_check.find(k) == to_check.end()) { + passed = false; + TUNABLE_LOG("key=\"", k, "\" is not provided for validation. "); + } + } + return passed; +} + +static bool CheckKeysMatching( + const TuningResultsValidator::GetValidateFuncs& gv_funcs, + const std::unordered_map& to_check) { + auto get_keys = [](const auto& it) -> std::string { return it.first; }; + std::vector required_keys; + std::vector provided_keys; + std::transform(gv_funcs.cbegin(), gv_funcs.cend(), std::back_inserter(required_keys), get_keys); + std::transform(to_check.cbegin(), to_check.cend(), std::back_inserter(provided_keys), get_keys); + std::sort(required_keys.begin(), required_keys.end()); + std::sort(provided_keys.begin(), provided_keys.end()); + + std::unordered_set intersection; + std::set_intersection(required_keys.cbegin(), required_keys.cend(), + provided_keys.cbegin(), provided_keys.cend(), + std::inserter(intersection, intersection.end())); + bool matched = true; + if (intersection.size() != required_keys.size()) { + matched = false; + for (const auto& k : required_keys) { + if (intersection.find(k) == intersection.end()) { + TORCH_WARN("Unmatched validator: \"", k, "\" is required, but the tuning results does not provide it. "); + } + } + } + if (intersection.size() != provided_keys.size()) { + matched = false; + for (const auto& k : provided_keys) { + if (intersection.find(k) == intersection.end()) { + TORCH_WARN("Unmatched validator: \"", k, "\" is provided, but pytorch is unable to consume it. 
"); + } + } + } + return matched; +} + +TuningStatus TuningResultsValidator::ValidateAll( + const std::unordered_map& to_validate) const { + if (!CheckMandatoryKeys(validators_, to_validate)) { + return FAIL; + } + if (!CheckKeysMatching(validators_, to_validate)) { + return FAIL; + } + + for (const auto& [key, value] : to_validate) { + const auto& it = validators_.find(key); + if (it == validators_.cend()) { + TORCH_WARN("Failed to lookup validator using key ", key); + for (const auto& [key2, val2] : validators_) { + TORCH_WARN("available key ", key2); + } + return FAIL; + } + const ValidateFunc& validator = it->second.second; + if (validator(value) != OK) { + TORCH_WARN("Failed validator: ", key); + return FAIL; + } + } + + return OK; +} + +void TuningResultsValidator::RegisterValidator(const std::string& key, const GetFunc& gf, const ValidateFunc& vf) { + if (validators_.find(key) != validators_.end()) { + TORCH_WARN("Attempting to re-register validator with key ", key); + } + else { + validators_[key] = std::make_pair(gf, vf); + } +} + +std::string TuningResultsValidator::GetPyTorchVersion() const { + return TORCH_VERSION; +} + +TuningStatus TuningResultsValidator::ValidatePyTorchVersion(const std::string& value) const { + if (value == GetPyTorchVersion()) { + return OK; + } + return FAIL; +} + +// TuningContext + +TuningContext::TuningContext() : + enable_{false}, + tuning_enable_{true}, + manager_initialized_{false}, + max_tuning_duration_ms_{30}, + max_tuning_iterations_{100}, + max_warmup_duration_ms_{0}, + max_warmup_iterations_{0}, + filename_{}, + results_count_from_input_file_{0} +{ +} + +TuningContext::~TuningContext() { + if (!manager_initialized_) { + // TuningResultsManager was never initialized, no tuning requested or performed. + // This can happen in a DDP job where a python process spawns other workers + // but doesn't do any computation itself. 
+ return; + } + auto filename = GetFilename(); + if (IsTunableOpEnabled() && IsTuningEnabled() && !filename.empty()) { + if (results_count_from_input_file_ < GetTuningResultsManager().GetSize()) { + if (results_count_from_input_file_ > 0) { + TUNABLE_LOG("additional tuning results available, rewriting file ", filename); + } + else { + TUNABLE_LOG("writing file ", filename); + } + if (!WriteFile(filename)) { + TUNABLE_LOG("failed to write file ", filename); + } + } + } +} + +void TuningContext::EnableTunableOp() { + TUNABLE_LOG("Enable TunableOp"); + enable_ = true; +} + +void TuningContext::DisableTunableOp() { + TUNABLE_LOG("Disable TunableOp"); + enable_ = false; +} + +bool TuningContext::IsTunableOpEnabled() const { + static const char *env = std::getenv("PYTORCH_TUNABLEOP_ENABLED"); + if (env != nullptr && strcmp(env, "1") == 0) { + //TUNABLE_LOG("PYTORCH_TUNABLEOP_ENABLED=1"); + return true; + } + return enable_; +} + +void TuningContext::EnableTuning() { + TUNABLE_LOG("Enable Tuning for TunableOp"); + tuning_enable_ = true; +} + +void TuningContext::DisableTuning() { + TUNABLE_LOG("Disable Tuning for TunableOp"); + tuning_enable_ = false; +} + +bool TuningContext::IsTuningEnabled() const { + static const char *env = std::getenv("PYTORCH_TUNABLEOP_TUNING"); + if (env != nullptr && strcmp(env, "0") == 0) { + //TUNABLE_LOG("PYTORCH_TUNABLEOP_TUNING=1"); + return false; + } + return tuning_enable_; +} + +void TuningContext::SetMaxTuningDurationMs(int max_duration_ms) { + max_tuning_duration_ms_ = max_duration_ms; +} + +int TuningContext::GetMaxTuningDurationMs() const { + static const char *env = std::getenv("PYTORCH_TUNABLEOP_MAX_TUNING_DURATION_MS"); + if (env != nullptr) { + return atoi(env); + } + return max_tuning_duration_ms_; +} + +void TuningContext::SetMaxTuningIterations(int max_iter) { + max_tuning_iterations_ = max_iter; +} + +int TuningContext::GetMaxTuningIterations() const { + static const char *env = std::getenv("PYTORCH_TUNABLEOP_MAX_TUNING_ITERATIONS"); + if (env != nullptr) { + return atoi(env); + } + return max_tuning_iterations_; +} + +void TuningContext::SetMaxWarmupDurationMs(int max_duration_ms) { + max_warmup_duration_ms_ = max_duration_ms; +} + +int TuningContext::GetMaxWarmupDurationMs() const { + static const char *env = std::getenv("PYTORCH_TUNABLEOP_MAX_WARMUP_DURATION_MS"); + if (env != nullptr) { + return atoi(env); + } + return max_warmup_duration_ms_; +} + +void TuningContext::SetMaxWarmupIterations(int max_iter) { + max_warmup_iterations_ = max_iter; +} + +int TuningContext::GetMaxWarmupIterations() const { + static const char *env = std::getenv("PYTORCH_TUNABLEOP_MAX_WARMUP_ITERATIONS"); + if (env != nullptr) { + return atoi(env); + } + return max_warmup_iterations_; +} + +void TuningContext::EnableTunableOpAndTuning() { + EnableTunableOp(); + EnableTuning(); +} + +void TuningContext::DisableTunableOpAndTuning() { + DisableTunableOp(); + DisableTuning(); +} + +TuningResultsManager& TuningContext::GetTuningResultsManager() { + c10::call_once(manager_init_once_, [this]() { + manager_initialized_ = true; + if (GetFilename().empty()) { + // if SetFilename() was not already called, call it now with the default or env var + const char *env = std::getenv("PYTORCH_TUNABLEOP_FILENAME"); + std::string filename = (env == nullptr) ? 
"tunableop_results.csv" : env; + SetFilename(filename); + } + auto filename = GetFilename(); + if (!filename.empty()) { + ReadFile(filename); + // attempt immediately to open file for writing to catch errors early + std::ofstream file(filename, std::ios::out | std::ios::app); + if (!file.good()) { + TORCH_WARN("failed to open file '", filename, "' for writing; your tuning results will not be saved"); + } + } + }); + return manager_; +} + +TuningResultsValidator& TuningContext::GetTuningResultsValidator() { + return validator_; +} + +TuningResults TuningContext::GetTuningResults() { + TuningResults tr; + tr.validators = GetTuningResultsValidator().GetAllValidators(); + tr.results = GetTuningResultsManager().Dump(); + return tr; +} + +TuningStatus TuningContext::LoadTuningResults(const TuningResults& tr) { + TORCH_CHECK(GetTuningResultsValidator().ValidateAll(tr.validators)); + GetTuningResultsManager().Load(tr.results); + return OK; +} + +void TuningContext::SetFilename(const std::string& filename) { + filename_ = filename; + + if (filename_.empty()) { + return; + } + + // differentiate filename based on device ordinal to avoid + // use case of one process per device writing to same file + std::string device = c10::str(int(c10::cuda::current_device())); + + // does filename contain %d to insert device ordinal in specific location? + const std::string TOKEN("%d"); + std::size_t found = filename_.find(TOKEN); + if (found != std::string::npos) { + filename_.replace(found, TOKEN.length(), device); + } + else { + // no %d present, so append device ordinal before final '.' + found = filename_.rfind("."); + if (found != std::string::npos) { + filename_.insert(found, device); + } + else { + // all else fails, just append + filename_.append(device); + } + } +} + +std::string TuningContext::GetFilename() const { + return filename_; +} + +bool TuningContext::ReadFile(const std::string& filename) { + TUNABLE_LOG("reading tuning results from ", filename); + ResultsMap results; + std::unordered_map validators; + std::string line; + std::ifstream file(filename); + if (!file) { + TUNABLE_LOG("could not open ", filename, " for reading tuning results"); + return false; + } + while (std::getline(file, line)) { + if (line.empty()) { + continue; + } + std::string part; + std::vector parts; + std::stringstream line_as_stream(line); + while (std::getline(line_as_stream, part, ',')) { + parts.push_back(part); + } + if (parts[0] == "Validator" && parts.size() >= 3) { + validators[parts[1]] = parts[2]; + TUNABLE_LOG("Validator ", parts[1], "=", parts[2]); + } + else if (parts.size() >= 4) { + results[parts[0]].emplace(parts[1], ResultEntry(parts[2], atof(parts[3].c_str()))); + } + else if (parts.size() >= 3) { + // the timestamp from the file is optional + results[parts[0]].emplace(parts[1], ResultEntry(parts[2], 0)); + } + else { + TUNABLE_LOG("could not parse line: ", line); + } + } + if (GetTuningResultsValidator().ValidateAll(validators) != FAIL) { + manager_.Load(results); + results_count_from_input_file_ = manager_.GetSize(); + } + else { + TUNABLE_LOG("results validator check failed"); + return false; + } + return true; +} + +bool TuningContext::WriteFile(const std::string& filename) { + std::ofstream file(filename, std::ios::out | std::ios::trunc); + if (!file.good()) { + TUNABLE_LOG("error opening tuning results file for writing ", filename); + return false; + } + auto validators = GetTuningResultsValidator().GetAllValidators(); + for (const auto& [key, val] : validators) { + file << "Validator," << key << 
"," << val << std::endl; + } + auto results = GetTuningResultsManager().Dump(); + for (const auto& [op_sig, kernelmap] : results) { + for (const auto& [param_sig, result] : kernelmap) { + file << op_sig << "," << param_sig << "," << result << std::endl; + } + } + file.close(); + return true; +} + +} // namespace at::cuda::tunable diff --git a/aten/src/ATen/cuda/tunable/Tunable.h b/aten/src/ATen/cuda/tunable/Tunable.h new file mode 100644 index 0000000000000..eb849a213fe5a --- /dev/null +++ b/aten/src/ATen/cuda/tunable/Tunable.h @@ -0,0 +1,205 @@ +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. +// +#pragma once + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::cuda::tunable { + +static void TunableLog(const std::string& msg) { + static const char *env = getenv("PYTORCH_TUNABLEOP_VERBOSE"); + if (env != nullptr && strcmp(env, "1") == 0) { + std::cerr << msg << std::endl; + } +} +#define TUNABLE_LOG(...) TunableLog(c10::str(__VA_ARGS__)) + +enum TuningStatus { + OK = 0, + FAIL = 1, + UNSUPPORTED = 2, +}; + +// Mapping from params signature to kernel id +class ResultEntry { + public: + explicit ResultEntry(const std::string& key, double time) : key_(key), time_(time) {} + bool operator==(const ResultEntry& other) { return key_ == other.key_; } + bool operator!=(const ResultEntry& other) { return key_ != other.key_; } + operator std::string () { return key_; } + friend std::ostream& operator<<(std::ostream& stream, const ResultEntry& entry); + static ResultEntry Null() { return ResultEntry("Null", 0.0); } + static ResultEntry Default() { return ResultEntry("Default", 0.0); } + + private: + std::string key_; + double time_; +}; + +typedef std::unordered_map KernelMap; +typedef std::unordered_map ResultsMap; + +struct TuningResults { + // Validates if these results are compatible with the libraries + std::unordered_map validators; + + // Mapping from Callable signature to Callable's tuning result + ResultsMap results; +}; + +class TuningResultsManager { + public: + TuningResultsManager() = default; + ~TuningResultsManager() = default; + + KernelMap Lookup(const std::string& op_signature); + + ResultEntry Lookup(const std::string& op_signature, const std::string& params_signature); + + inline void AddImpl(const std::string& op_signature, + const std::string& params_signature, + ResultEntry best, + KernelMap& kernel_map); + + void Add(const std::string& op_signature, + const std::string& params_signature, + ResultEntry best); + + void Delete(const std::string& op_signature, const std::string& params_signature); + + inline void DisjointMergeImpl( + const std::string& op_signature, + const KernelMap& kernel_map, + /*out*/ ResultsMap& results); + + void Load(const ResultsMap& results_to_load); + + ResultsMap Dump(); + + void DisjointMerge(const std::string& op_signature, const KernelMap& kernel_map); + + size_t GetSize(); + + private: + std::mutex lock_; + ResultsMap results_; +}; + +class TuningResultsValidator { + public: + using GetFunc = std::function; + using ValidateFunc = std::function; + using GetValidateFuncs = std::unordered_map>; + + TuningResultsValidator(); + 
~TuningResultsValidator() = default; + + std::unordered_map GetAllValidators() const; + TuningStatus ValidateAll(const std::unordered_map& to_validate) const; + void RegisterValidator(const std::string& key, const GetFunc& gf, const ValidateFunc& vf); + + protected: + std::string GetPyTorchVersion() const; + TuningStatus ValidatePyTorchVersion(const std::string& value) const; + + public: + static constexpr const std::array mandatory_keys{"PT_VERSION"}; + + private: + GetValidateFuncs validators_; +}; + +class TuningContext { + public: + TuningContext(); + ~TuningContext(); + TuningContext(TuningContext &) = delete; + TuningContext(TuningContext &&) = delete; + TuningContext &operator=(TuningContext &) = delete; + TuningContext &operator=(TuningContext &&) = delete; + + void EnableTunableOp(); + void DisableTunableOp(); + bool IsTunableOpEnabled() const; + + void EnableTuning(); + void DisableTuning(); + bool IsTuningEnabled() const; + + void SetMaxTuningDurationMs(int max_duration_ms); + int GetMaxTuningDurationMs() const; + + void SetMaxTuningIterations(int max_iter); + int GetMaxTuningIterations() const; + + void SetMaxWarmupDurationMs(int max_duration_ms); + int GetMaxWarmupDurationMs() const; + + void SetMaxWarmupIterations(int max_iter); + int GetMaxWarmupIterations() const; + + void EnableTunableOpAndTuning(); + void DisableTunableOpAndTuning(); + + TuningResultsManager& GetTuningResultsManager(); + + TuningResultsValidator& GetTuningResultsValidator(); + + TuningResults GetTuningResults(); + + TuningStatus LoadTuningResults(const TuningResults& tr); + + void SetFilename(const std::string& filename); + std::string GetFilename() const; + + protected: + bool ReadFile(const std::string& filename); + bool WriteFile(const std::string& filename); + + private: + bool enable_; + bool tuning_enable_; + bool manager_initialized_; + int max_tuning_duration_ms_; + int max_tuning_iterations_; + int max_warmup_duration_ms_; + int max_warmup_iterations_; + mutable TuningResultsManager manager_; + mutable c10::once_flag manager_init_once_; + TuningResultsValidator validator_; + std::string filename_; + size_t results_count_from_input_file_; +}; + +TuningContext* getTuningContext(); + +class ITimer { + public: + ITimer() = default; + virtual ~ITimer() = default; + + virtual void Start() = 0; + virtual void End() = 0; + + /// Computes the elapsed time in milliseconds between Start() and End() + virtual float Duration() = 0; +}; + +} // namespace at::cuda::tunable diff --git a/aten/src/ATen/cuda/tunable/TunableGemm.h b/aten/src/ATen/cuda/tunable/TunableGemm.h new file mode 100644 index 0000000000000..3b5e7e0903c89 --- /dev/null +++ b/aten/src/ATen/cuda/tunable/TunableGemm.h @@ -0,0 +1,368 @@ +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. 
+// +#pragma once + +#include +#ifdef USE_ROCM +#if ROCM_VERSION >= 50700 +#include +#endif +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef USE_ROCM +#include +#endif + +#define STRINGIFY(s) #s +#define XSTRINGIFY(s) STRINGIFY(s) + +namespace at::cuda::tunable { + +template +class DefaultGemmOp : public Callable> { + public: + TuningStatus Call(const GemmParams* params) override { + at::cuda::blas::gemm_internal( + params->transa, params->transb, + params->m, params->n, params->k, + params->alpha, + params->a, params->lda, + params->b, params->ldb, + params->beta, + params->c, params->ldc); + return OK; + } +}; + +template +class DefaultGemmStridedBatchedOp : public Callable> { + public: + TuningStatus Call(const GemmStridedBatchedParams* params) override { + at::cuda::blas::bgemm_internal( + params->transa, params->transb, + params->m, params->n, params->k, + params->alpha, + params->a, params->lda, params->stride_a, + params->b, params->ldb, params->stride_b, + params->beta, + params->c, params->ldc, params->stride_c, + params->batch); + return OK; + } +}; + +template +class DefaultScaledGemmOp : public Callable> { + public: + TuningStatus Call(const ScaledGemmParams* params) override { + at::cuda::blas::scaled_gemm( + params->transa, + params->transb, + params->m, + params->n, + params->k, + params->a, + params->a_scale_ptr, + params->lda, + params->a_dtype, + params->b, + params->b_scale_ptr, + params->ldb, + params->b_dtype, + params->bias_ptr, + params->bias_dtype, + params->c, + params->c_scale_ptr, + params->ldc, + params->c_dtype, + params->amax_ptr, + params->use_fast_accum); + return OK; + } +}; + +template +inline bool IsZero(T v) { + return v == 0.0f; +} + +template <> +inline bool IsZero(BFloat16 v) { + return v.x == 0; +} + +template <> +inline bool IsZero(Half v) { + return float(v) == 0.0f; +} + +template <> +inline bool IsZero(c10::complex v) { + return v == 0.0; +} + +template <> +inline bool IsZero(c10::complex v) { + return v == 0.0f; +} + +template +inline std::string TypeName(T v) { + return "unknown"; +} + +template <> +inline std::string TypeName(float v) { + return "float"; +} + +template <> +inline std::string TypeName(double v) { + return "double"; +} + +template <> +inline std::string TypeName(BFloat16 v) { + return "BFloat16"; +} + +template <> +inline std::string TypeName(Half v) { + return "Half"; +} + +template <> +inline std::string TypeName(Float8_e4m3fn v) { + return "Float8_e4m3fn"; +} + +template <> +inline std::string TypeName(Float8_e5m2 v) { + return "Float8_e5m2"; +} + +template <> +inline std::string TypeName(Float8_e4m3fnuz v) { + return "Float8_e4m3fnuz"; +} + +template <> +inline std::string TypeName(Float8_e5m2fnuz v) { + return "Float8_e5m2fnuz"; +} + +template <> +inline std::string TypeName(c10::complex v) { + return "c10::complex"; +} + +template <> +inline std::string TypeName(c10::complex v) { + return "c10::complex"; +} + + +template +class GemmTunableOp : public TunableOp, StreamTimer> { + public: + GemmTunableOp() { + this->RegisterOp(std::string("Default"), std::make_unique>()); + + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); + +#ifdef USE_ROCM + for (auto&& [name, op] : GetRocBlasGemmTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + + if (validators.find("ROCM_VERSION") == validators.end()) { + std::string rocm_version = ROCM_BUILD_INFO; + 
getTuningContext()->GetTuningResultsValidator().RegisterValidator( + "ROCM_VERSION", + [rocm_version]() { return rocm_version; }, + [rocm_version](auto&& k) { return rocm_version == k ? OK : FAIL; }); + } + + if (validators.find("GCN_ARCH_NAME") == validators.end()) { + std::string gcn_arch_name = at::cuda::getCurrentDeviceProperties()->gcnArchName; + getTuningContext()->GetTuningResultsValidator().RegisterValidator( + "GCN_ARCH_NAME", + [gcn_arch_name]() { return gcn_arch_name; }, + [gcn_arch_name](auto&& k) { return gcn_arch_name == k ? OK : FAIL; }); + } + + if (validators.find("ROCBLAS_VERSION") == validators.end()) { + std::string rocblas_version = c10::str( + XSTRINGIFY(ROCBLAS_VERSION_MAJOR), ".", + XSTRINGIFY(ROCBLAS_VERSION_MINOR), ".", + XSTRINGIFY(ROCBLAS_VERSION_PATCH), "-", + XSTRINGIFY(ROCBLAS_VERSION_TWEAK)); + getTuningContext()->GetTuningResultsValidator().RegisterValidator( + "ROCBLAS_VERSION", + [rocblas_version]() { return rocblas_version; }, + [rocblas_version](auto&& k) { return rocblas_version == k ? OK : FAIL; }); + } +#endif + +#if defined(USE_ROCM) && ROCM_VERSION >= 50700 + static const char *env = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (env == nullptr || strcmp(env, "1") == 0) { + // disallow tuning of hipblaslt with c10::complex + if constexpr ( + !std::is_same_v> && + !std::is_same_v>) { + for (auto&& [name, op] : GetHipBlasLtGemmTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + } + + if (validators.find("HIPBLASLT_VERSION") == validators.end()) { + std::string hipblaslt_version = c10::str( + XSTRINGIFY(HIPBLASLT_VERSION_MAJOR), ".", + XSTRINGIFY(HIPBLASLT_VERSION_MINOR), ".", + XSTRINGIFY(HIPBLASLT_VERSION_PATCH), "-", + XSTRINGIFY(HIPBLASLT_VERSION_TWEAK)); + getTuningContext()->GetTuningResultsValidator().RegisterValidator( + "HIPBLASLT_VERSION", + [hipblaslt_version]() { return hipblaslt_version; }, + [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; }); + } + } +#endif + } + + std::string Signature() override { + return c10::str("GemmTunableOp_", TypeName(T{}), "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); + } +}; + +template +class GemmStridedBatchedTunableOp : public TunableOp, StreamTimer> { + public: + GemmStridedBatchedTunableOp() { + this->RegisterOp(std::string("Default"), std::make_unique>()); + + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); + +#ifdef USE_ROCM + for (auto&& [name, op] : GetRocBlasGemmStridedBatchedTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + + if (validators.find("ROCM_VERSION") == validators.end()) { + std::string rocm_version = ROCM_BUILD_INFO; + getTuningContext()->GetTuningResultsValidator().RegisterValidator( + "ROCM_VERSION", + [rocm_version]() { return rocm_version; }, + [rocm_version](auto&& k) { return rocm_version == k ? OK : FAIL; }); + } + + if (validators.find("GCN_ARCH_NAME") == validators.end()) { + std::string gcn_arch_name = at::cuda::getCurrentDeviceProperties()->gcnArchName; + getTuningContext()->GetTuningResultsValidator().RegisterValidator( + "GCN_ARCH_NAME", + [gcn_arch_name]() { return gcn_arch_name; }, + [gcn_arch_name](auto&& k) { return gcn_arch_name == k ? 
OK : FAIL; }); + } + + if (validators.find("ROCBLAS_VERSION") == validators.end()) { + std::string rocblas_version = c10::str( + XSTRINGIFY(ROCBLAS_VERSION_MAJOR), ".", + XSTRINGIFY(ROCBLAS_VERSION_MINOR), ".", + XSTRINGIFY(ROCBLAS_VERSION_PATCH), "-", + XSTRINGIFY(ROCBLAS_VERSION_TWEAK)); + getTuningContext()->GetTuningResultsValidator().RegisterValidator( + "ROCBLAS_VERSION", + [rocblas_version]() { return rocblas_version; }, + [rocblas_version](auto&& k) { return rocblas_version == k ? OK : FAIL; }); + } +#endif + +#if defined(USE_ROCM) && ROCM_VERSION >= 50700 + static const char *env = std::getenv("PYTORCH_TUNABLEOP_HIPBLASLT_ENABLED"); + if (env == nullptr || strcmp(env, "1") == 0) { + // disallow tuning of hipblaslt with c10::complex + if constexpr ( + !std::is_same_v> && + !std::is_same_v>) { + for (auto&& [name, op] : GetHipBlasLtGemmStridedBatchedTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + } + + if (validators.find("HIPBLASLT_VERSION") == validators.end()) { + std::string hipblaslt_version = c10::str( + XSTRINGIFY(HIPBLASLT_VERSION_MAJOR), ".", + XSTRINGIFY(HIPBLASLT_VERSION_MINOR), ".", + XSTRINGIFY(HIPBLASLT_VERSION_PATCH), "-", + XSTRINGIFY(HIPBLASLT_VERSION_TWEAK)); + getTuningContext()->GetTuningResultsValidator().RegisterValidator( + "HIPBLASLT_VERSION", + [hipblaslt_version]() { return hipblaslt_version; }, + [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; }); + } + } +#endif + } + + std::string Signature() override { + return c10::str("GemmStridedBatchedTunableOp_", TypeName(T{}), "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); + } +}; + +template +class ScaledGemmTunableOp : public TunableOp, StreamTimer> { + public: + ScaledGemmTunableOp() { + this->RegisterOp(std::string("Default"), std::make_unique>()); + + auto validators = getTuningContext()->GetTuningResultsValidator().GetAllValidators(); + +#if defined(USE_ROCM) && ROCM_VERSION >= 50700 + for (auto&& [name, op] : GetHipBlasLtScaledGemmTypeStringAndOps()) { + this->RegisterOp(std::move(name), std::move(op)); + } + + if (validators.find("HIPBLASLT_VERSION") == validators.end()) { + std::string hipblaslt_version = c10::str( + XSTRINGIFY(HIPBLASLT_VERSION_MAJOR), ".", + XSTRINGIFY(HIPBLASLT_VERSION_MINOR), ".", + XSTRINGIFY(HIPBLASLT_VERSION_PATCH), "-", + XSTRINGIFY(HIPBLASLT_VERSION_TWEAK)); + getTuningContext()->GetTuningResultsValidator().RegisterValidator( + "HIPBLASLT_VERSION", + [hipblaslt_version]() { return hipblaslt_version; }, + [hipblaslt_version](auto&& k) { return hipblaslt_version == k ? OK : FAIL; }); + } +#endif + } + + std::string Signature() override { + return c10::str("ScaledGemmTunableOp", + "_", TypeName(AT{}), + "_", TypeName(BT{}), + "_", TypeName(CT{}), + "_", BlasOpToString(ALayout), BlasOpToString(BLayout)); + } +}; + +#undef XSTRINGIFY +#undef STRINGIFY + +} // namespace at::cuda::tunable diff --git a/aten/src/ATen/cuda/tunable/TunableOp.h b/aten/src/ATen/cuda/tunable/TunableOp.h new file mode 100644 index 0000000000000..65257974ab0cd --- /dev/null +++ b/aten/src/ATen/cuda/tunable/TunableOp.h @@ -0,0 +1,242 @@ +// Original TunableOp is from onnxruntime. +// https://github.com/microsoft/onnxruntime/blob/main/onnxruntime/core/framework/tunable.h +// https://github.com/microsoft/onnxruntime/tree/main/onnxruntime/core/providers/rocm/tunable +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT license. +// +// Adapting TunableOp into PyTorch +// Copyright (c) Advanced Micro Devices, Inc. 
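The TunableGemm.h hunks above register build-environment validators (ROCM_VERSION, GCN_ARCH_NAME, ROCBLAS_VERSION, HIPBLASLT_VERSION) so that previously saved tuning results are only reused when they were produced under a matching toolchain. The sketch below is not the PyTorch TuningResultsValidator; it is a minimal, self-contained illustration with hypothetical names (MiniValidator, Register, Snapshot, Accept) of the two-lambda pattern used above: one callback reports the current value, the other checks a stored value against it.

#include <functional>
#include <iostream>
#include <map>
#include <string>
#include <utility>

// Minimal stand-in for the validator-registry pattern (hypothetical names).
class MiniValidator {
 public:
  using GetFn = std::function<std::string()>;
  using CheckFn = std::function<bool(const std::string&)>;

  // Register a key with a "current value" getter and a "stored value" checker.
  void Register(const std::string& key, GetFn get, CheckFn check) {
    validators_[key] = {std::move(get), std::move(check)};
  }

  // Values that would be written next to saved tuning results.
  std::map<std::string, std::string> Snapshot() const {
    std::map<std::string, std::string> out;
    for (const auto& [key, fns] : validators_) {
      out[key] = fns.first();
    }
    return out;
  }

  // Accept previously saved results only if every recorded value still checks out.
  bool Accept(const std::map<std::string, std::string>& saved) const {
    for (const auto& [key, value] : saved) {
      auto it = validators_.find(key);
      if (it == validators_.end() || !it->second.second(value)) {
        return false;
      }
    }
    return true;
  }

 private:
  std::map<std::string, std::pair<GetFn, CheckFn>> validators_;
};

int main() {
  MiniValidator v;
  std::string rocm_version = "6.0";  // would come from ROCM_BUILD_INFO in the real code
  v.Register(
      "ROCM_VERSION",
      [rocm_version]() { return rocm_version; },
      [rocm_version](const std::string& k) { return k == rocm_version; });

  auto saved = v.Snapshot();             // e.g. persisted alongside tuned GEMM results
  std::cout << v.Accept(saved) << "\n";  // 1: same environment, results reusable
  saved["ROCM_VERSION"] = "5.7";
  std::cout << v.Accept(saved) << "\n";  // 0: mismatch, saved results rejected
  return 0;
}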
+// +#pragma once + +#include +#include + +#ifndef _WIN32 +#include +#endif + +#include +#include +#include +#include + +namespace at::cuda::tunable { + +template +class Callable { + public: + Callable() = default; + Callable(Callable&&) = default; + virtual ~Callable() = default; + virtual TuningStatus Call(const ParamsT*) { + return FAIL; + } + virtual TuningStatus IsSupported(const ParamsT* params) { + return Call(params); + } +}; + +template +class TunableOp { + public: + TunableOp() = default; + TunableOp(TunableOp&&) = default; + virtual ~TunableOp() = default; + + TuningStatus operator()(const ParamsT* params) { + ResultEntry result = ResultEntry::Null(); + TuningContext* ctx = getTuningContext(); + if (ctx->IsTunableOpEnabled()) { + auto& mgr = ctx->GetTuningResultsManager(); + auto op_sig = Signature(); + auto params_sig = params->Signature(); + result = mgr.Lookup(op_sig, params_sig); + // If no previous tuning result has been found, we do the tuning iff tuning is enabled + if (result == ResultEntry::Null() && ctx->IsTuningEnabled()) { + result = FindFastest(params); + mgr.Add(op_sig, params_sig, result); + } + } + else { + result = ResultEntry::Default(); + } + if (result == ResultEntry::Null()) { + TUNABLE_LOG("no result, using default"); + result = ResultEntry::Default(); + } + auto iter = ops_.find(result); + TORCH_CHECK(iter != ops_.end()); + return iter->second->Call(params); + } + + virtual std::string Signature() { + // According to C++17 standard https://wg21.link/n4659 section 15.7.4 + // > if the operand of typeid refers to the + // > object under construction or destruction, typeid yields the std::type_info object representing the constructor + // > or destructor's class. + // So delay the op signature generation. + c10::call_once(signature_init_once_, [this]() { signature_ = CreateSignature(); }); + return signature_; + } + + protected: + void RegisterOp(const std::string& name, std::unique_ptr> op) { + this->op_names_.emplace_back(name); + this->ops_.emplace(name, std::move(op)); + } + + private: + static void WarmUp(Callable *op, ParamsT* param, size_t num_iter) { + for (size_t i = 0; i < num_iter; i++) { + TORCH_CHECK(op->Call(param) == OK); + } + } + + static double Profile(Callable *op, ParamsT* param, size_t num_iter) { + TimerT timer{}; + timer.Start(); + for (size_t i = 0; i < num_iter; i++) { + TORCH_CHECK(op->Call(param) == OK); + } + timer.End(); + return timer.Duration() / num_iter; + } + + protected: + bool IsNumericsCheckEnabled() { + static const char *env = getenv("PYTORCH_TUNABLEOP_NUMERICAL_CHECK"); + if (env != nullptr && strcmp(env, "0") == 0) { + return false; + } + return true; + } + + virtual ResultEntry FindFastest(const ParamsT* params) { + TuningContext* ctx = getTuningContext(); + auto op_sig = Signature(); + auto params_sig = params->Signature(); + TUNABLE_LOG("finding fastest for ", op_sig, '(', params_sig, ')', " out of ", op_names_.size(), " candidates"); + auto min_duration_ms = std::numeric_limits::infinity(); + std::string id_name = "Default"; + + // calculate a reference answer for numerical check + ParamsT* reference_params = params->DeepCopy(); + TORCH_CHECK(ops_[ResultEntry::Default()]->Call(reference_params) == OK); + + // need a copy of params to reuse + ParamsT* reusable_params = params->DeepCopy(); + + for (size_t i = 0; i < op_names_.size(); i++) { + auto* candidate = ops_[op_names_[i]].get(); // borrow pointer + auto status = candidate->Call(reusable_params); + if (status != OK) { + TUNABLE_LOG("├──unsupported
id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + + if (IsNumericsCheckEnabled()) { + ParamsT* numerical_params = params->DeepCopy(); + WarmUp(candidate, numerical_params, 1); + status = reference_params->NumericalCheck(numerical_params); + numerical_params->Delete(); + if (status != OK) { + TUNABLE_LOG("ā”œā”€ā”€numerics check failed for id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + } + + // collect a small profile + constexpr const int approx_num_iter = 3; + auto approx_duration = Profile(candidate, reusable_params, approx_num_iter); + // bail if too slow + if (approx_duration > 2 * min_duration_ms) { + TUNABLE_LOG("ā”œā”€ā”€skip slow instance id=", i, ", ", op_sig, '(', params_sig, ") ", op_names_[i]); + continue; + } + + // for warmup does user set max duration, max iters, or both? + double max_warmup_duration = ctx->GetMaxWarmupDurationMs(); + int max_warmup_iter = ctx->GetMaxWarmupIterations(); + int warmup_iter = 1; // default + if (max_warmup_duration > 0) { + int duration_iters = max_warmup_duration / approx_duration; + if (max_warmup_iter > 0) { + warmup_iter = std::min(max_warmup_iter, duration_iters); + } + else { + warmup_iter = duration_iters; + } + } + else if (max_warmup_iter > 0) { + warmup_iter = max_warmup_iter; + } + + // for tuning does user set max duration, max iters, or both? + double max_tuning_duration = ctx->GetMaxTuningDurationMs(); + int max_tuning_iter = ctx->GetMaxTuningIterations(); + int tuning_iter = 100; // default + if (max_tuning_duration > 0) { + int duration_iters = max_tuning_duration / approx_duration; + if (max_tuning_iter > 0) { + tuning_iter = std::min(max_tuning_iter, duration_iters); + } + else { + tuning_iter = duration_iters; + } + } + else if (max_tuning_iter > 0) { + tuning_iter = max_tuning_iter; + } + + // do the full warmup followed by tuning + double warmup_ms = warmup_iter * approx_duration; + double tuning_ms = tuning_iter * approx_duration; + TUNABLE_LOG("ā”œā”€ā”€tuning using " + "warmup iters ", warmup_iter, " [", warmup_ms, " ms] " + "and tuning iters ", tuning_iter, " [", tuning_ms, " ms] ", + "instance id=", i, ", ", op_sig, "(", params_sig, ") ", op_names_[i]); + WarmUp(candidate, reusable_params, warmup_iter); + auto duration_ms = Profile(candidate, reusable_params, tuning_iter); + if (duration_ms < min_duration_ms) { + TUNABLE_LOG("ā”œā”€ā”€found better instance id=", i, ". " , duration_ms, "ms. 
", op_names_[i]); + min_duration_ms = duration_ms; + id_name = op_names_[i]; + } + } + + reusable_params->Delete(); + reference_params->Delete(); + + TUNABLE_LOG("ā””ā”€ā”€found fastest for ", op_sig, '(', params_sig, ") ", id_name); + return ResultEntry(id_name, min_duration_ms); + } + + private: + std::string CreateSignature() { +#ifndef _WIN32 + const auto* name = typeid(*this).name(); + char buf[256]; + size_t buf_len = 256; + abi::__cxa_demangle(name, buf, &buf_len, nullptr); + buf[255] = '\0'; + return buf; +#else + return typeid(*this).name(); +#endif + } + + mutable c10::once_flag signature_init_once_; + std::string signature_; + + std::unordered_map>> ops_; + std::vector op_names_; +}; + +struct OpParams { + OpParams() {} + virtual ~OpParams() = default; + virtual std::string Signature() const = 0; +}; + +} // namespace at::cuda::tunable diff --git a/aten/src/ATen/cudnn/Descriptors.h b/aten/src/ATen/cudnn/Descriptors.h index 694e93216b7a8..79a2fe58ad007 100644 --- a/aten/src/ATen/cudnn/Descriptors.h +++ b/aten/src/ATen/cudnn/Descriptors.h @@ -210,9 +210,7 @@ struct TORCH_CUDA_CPP_API ConvolutionDescriptor if(dataType == CUDNN_DATA_HALF) { AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH)); } else if (dataType == CUDNN_DATA_FLOAT && !allow_tf32) { -#if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8000 AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_FMA_MATH)); -#endif } } }; @@ -304,13 +302,9 @@ struct TORCH_CUDA_CPP_API RNNDescriptor : public Descriptor< if (input_type == CUDNN_DATA_HALF) { cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_TENSOR_OP_MATH); } -#endif -#if !defined(USE_CUDNN_RNN_V8_API) && defined(CUDNN_VERSION) && CUDNN_VERSION >= 8000 else if (input_type == CUDNN_DATA_FLOAT && !allow_tf32) { cudnnSetRNNMatrixMathType(mut_desc(), CUDNN_FMA_MATH); } -#endif -#ifndef USE_CUDNN_RNN_V8_API else { // Technically, as the default it's not necessary to explicitly // set this. @@ -318,6 +312,15 @@ struct TORCH_CUDA_CPP_API RNNDescriptor : public Descriptor< } } #else + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + auto math_type = CUDNN_DEFAULT_MATH; + if (prop->major >= 7) { + if (input_type == CUDNN_DATA_HALF) { + math_type = CUDNN_TENSOR_OP_MATH; + } else if (!allow_tf32) { + math_type = CUDNN_FMA_MATH; + } + } AT_CUDNN_CHECK(cudnnSetRNNDescriptor_v8( mut_desc(), algo, @@ -327,7 +330,7 @@ struct TORCH_CUDA_CPP_API RNNDescriptor : public Descriptor< input_mode, input_type, datatype, - allow_tf32 ? CUDNN_DEFAULT_MATH : CUDNN_FMA_MATH, + math_type, input_size, hidden_size, proj_size ? proj_size : hidden_size, diff --git a/aten/src/ATen/cudnn/Handle.cpp b/aten/src/ATen/cudnn/Handle.cpp index ec0f416e85aea..f57744f129d98 100644 --- a/aten/src/ATen/cudnn/Handle.cpp +++ b/aten/src/ATen/cudnn/Handle.cpp @@ -34,7 +34,7 @@ using CudnnPoolType = at::cuda::DeviceThreadHandlePool +#include +namespace at { + +// AcceleratorHooksInterface is a shared interface provided by all +// accelerators to allow generic code. +// This inferface is hook-based as it corresponds to all the functions +// that are going to be called in a generic way from the CPU code. + +struct TORCH_API AcceleratorHooksInterface { + // This should never actually be implemented, but it is used to + // squelch -Werror=non-virtual-dtor + virtual ~AcceleratorHooksInterface() = default; + + // Whether the device at device_index is fully initialized or not. 
+ virtual bool hasPrimaryContext(DeviceIndex device_index) const = 0; + + virtual DeviceIndex deviceCount() const { + return 0; + } + + virtual void setCurrentDevice(DeviceIndex device) const { + TORCH_CHECK(false, "Backend doesn't support setCurrentDevice()"); + } + + virtual DeviceIndex getCurrentDevice() const { + TORCH_CHECK(false, "Backend doesn't support getCurrentDevice()"); + return -1; + } + + virtual DeviceIndex exchangeDevice(DeviceIndex device) const { + TORCH_CHECK(false, "Backend doesn't support exchangeDevice()"); + return -1; + } + + virtual DeviceIndex maybeExchangeDevice(DeviceIndex device) const { + TORCH_CHECK(false, "Backend doesn't support maybeExchangeDevice()"); + return -1; + } +}; + +} // namespace at diff --git a/aten/src/ATen/detail/CUDAHooksInterface.h b/aten/src/ATen/detail/CUDAHooksInterface.h index 981fd1227a81d..860e49ff3d6f5 100644 --- a/aten/src/ATen/detail/CUDAHooksInterface.h +++ b/aten/src/ATen/detail/CUDAHooksInterface.h @@ -4,6 +4,8 @@ #include #include +#include + // Forward-declares at::Generator and at::cuda::NVRTC namespace at { struct Generator; @@ -57,10 +59,10 @@ constexpr const char* CUDA_HELP = // TODO: Consider putting the stub definitions in another class, so that one // never forgets to implement each virtual function in the real implementation // in CUDAHooks. This probably doesn't buy us much though. -struct TORCH_API CUDAHooksInterface { +struct TORCH_API CUDAHooksInterface : AcceleratorHooksInterface { // This should never actually be implemented, but it is used to // squelch -Werror=non-virtual-dtor - virtual ~CUDAHooksInterface() = default; + virtual ~CUDAHooksInterface() override = default; // Initialize THCState and, transitively, the CUDA state virtual void initCUDA() const { @@ -99,6 +101,10 @@ struct TORCH_API CUDAHooksInterface { return false; } + virtual bool hasCuBLASLt() const { + return false; + } + virtual bool hasROCM() const { return false; } @@ -107,7 +113,7 @@ struct TORCH_API CUDAHooksInterface { TORCH_CHECK(false, "NVRTC requires CUDA. ", CUDA_HELP); } - virtual bool hasPrimaryContext(DeviceIndex device_index) const { + virtual bool hasPrimaryContext(DeviceIndex device_index) const override { TORCH_CHECK(false, "Cannot call hasPrimaryContext(", device_index, ") without ATen_cuda library. 
", CUDA_HELP); } diff --git a/aten/src/ATen/detail/HIPHooksInterface.h b/aten/src/ATen/detail/HIPHooksInterface.h index 3ce351b623908..7f4862c408680 100644 --- a/aten/src/ATen/detail/HIPHooksInterface.h +++ b/aten/src/ATen/detail/HIPHooksInterface.h @@ -38,7 +38,7 @@ struct TORCH_API HIPHooksInterface { return false; } - virtual int64_t current_device() const { + virtual c10::DeviceIndex current_device() const { return -1; } diff --git a/aten/src/ATen/detail/MAIAHooksInterface.cpp b/aten/src/ATen/detail/MAIAHooksInterface.cpp new file mode 100644 index 0000000000000..e82ad8f677018 --- /dev/null +++ b/aten/src/ATen/detail/MAIAHooksInterface.cpp @@ -0,0 +1,29 @@ +#include + +#include +#include + +#include +#include + +namespace at { +namespace detail { + +// See getCUDAHooks for some more commentary +const MAIAHooksInterface& getMAIAHooks() { + static std::unique_ptr maia_hooks; + static c10::once_flag once; + c10::call_once(once, [] { + maia_hooks = MAIAHooksRegistry()->Create("MAIAHooks", {}); + if (!maia_hooks) { + maia_hooks = std::make_unique(); + } + }); + return *maia_hooks; +} +} // namespace detail + +// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) +C10_DEFINE_REGISTRY(MAIAHooksRegistry, MAIAHooksInterface, MAIAHooksArgs) + +} // namespace at diff --git a/aten/src/ATen/detail/MAIAHooksInterface.h b/aten/src/ATen/detail/MAIAHooksInterface.h new file mode 100644 index 0000000000000..ad4ef146eccd9 --- /dev/null +++ b/aten/src/ATen/detail/MAIAHooksInterface.h @@ -0,0 +1,31 @@ +#pragma once + +#include +#include + +// NB: Class must live in `at` due to limitations of Registry.h. +namespace at { + +struct TORCH_API MAIAHooksInterface { + // This should never actually be implemented, but it is used to + // squelch -Werror=non-virtual-dtor + virtual ~MAIAHooksInterface() = default; + + virtual std::string showConfig() const { + TORCH_CHECK(false, "Cannot query detailed MAIA version information."); + } +}; + +// NB: dummy argument to suppress "ISO C++11 requires at least one argument +// for the "..." in a variadic macro" +struct TORCH_API MAIAHooksArgs {}; + +TORCH_DECLARE_REGISTRY(MAIAHooksRegistry, MAIAHooksInterface, MAIAHooksArgs); +#define REGISTER_MAIA_HOOKS(clsname) \ + C10_REGISTER_CLASS(MAIAHooksRegistry, clsname, clsname) + +namespace detail { +TORCH_API const MAIAHooksInterface& getMAIAHooks(); +} // namespace detail + +} // namespace at diff --git a/aten/src/ATen/detail/MPSHooksInterface.h b/aten/src/ATen/detail/MPSHooksInterface.h index a982437505a4a..f82a802618d43 100644 --- a/aten/src/ATen/detail/MPSHooksInterface.h +++ b/aten/src/ATen/detail/MPSHooksInterface.h @@ -4,6 +4,7 @@ #include #include +#include #include #include @@ -11,13 +12,13 @@ namespace at { -struct TORCH_API MPSHooksInterface { +struct TORCH_API MPSHooksInterface : AcceleratorHooksInterface { // this fails the implementation if MPSHooks functions are called, but // MPS backend is not present. 
#define FAIL_MPSHOOKS_FUNC(func) \ TORCH_CHECK(false, "Cannot execute ", func, "() without MPS backend."); - virtual ~MPSHooksInterface() = default; + virtual ~MPSHooksInterface() override = default; // Initialize the MPS library state virtual void initMPS() const { @@ -86,7 +87,9 @@ struct TORCH_API MPSHooksInterface { virtual double elapsedTimeOfEvents(uint32_t start_event_id, uint32_t end_event_id) const { FAIL_MPSHOOKS_FUNC(__func__); } - + virtual bool hasPrimaryContext(DeviceIndex device_index) const override { + FAIL_MPSHOOKS_FUNC(__func__); + } #undef FAIL_MPSHOOKS_FUNC }; diff --git a/aten/src/ATen/detail/MTIAHooksInterface.cpp b/aten/src/ATen/detail/MTIAHooksInterface.cpp index 6b69fdb03f3d8..0963881713861 100644 --- a/aten/src/ATen/detail/MTIAHooksInterface.cpp +++ b/aten/src/ATen/detail/MTIAHooksInterface.cpp @@ -8,19 +8,22 @@ namespace at { namespace detail { - -const MTIAHooksInterface &getMTIAHooks() { - static MTIAHooksInterface* MTIA_hooks = nullptr; +const MTIAHooksInterface& getMTIAHooks() { + static std::unique_ptr mtia_hooks = nullptr; static c10::once_flag once; c10::call_once(once, [] { - MTIA_hooks = - MTIAHooksRegistry()->Create("MTIAHooks", MTIAHooksArgs{}).release(); - if (!MTIA_hooks) { - MTIA_hooks = new MTIAHooksInterface(); + mtia_hooks = MTIAHooksRegistry()->Create("MTIAHooks", MTIAHooksArgs{}); + if (!mtia_hooks) { + mtia_hooks = std::make_unique(); } }); - return *MTIA_hooks; + return *mtia_hooks; +} + +bool isMTIAHooksBuilt() { + return MTIAHooksRegistry()->Has("MTIAHooks"); } + } // namespace detail C10_DEFINE_REGISTRY(MTIAHooksRegistry, MTIAHooksInterface, MTIAHooksArgs) diff --git a/aten/src/ATen/detail/MTIAHooksInterface.h b/aten/src/ATen/detail/MTIAHooksInterface.h index f969beef7a36e..1da1bda4e6130 100644 --- a/aten/src/ATen/detail/MTIAHooksInterface.h +++ b/aten/src/ATen/detail/MTIAHooksInterface.h @@ -1,9 +1,13 @@ #pragma once +#include #include +#include #include +#include + #include namespace at { @@ -17,25 +21,72 @@ constexpr const char* MTIA_HELP = "this error has occurred because you are trying " "to use some MTIA's functionality without MTIA extension included."; -struct TORCH_API MTIAHooksInterface { - virtual ~MTIAHooksInterface() = default; +struct TORCH_API MTIAHooksInterface : AcceleratorHooksInterface { +// this fails the implementation if MTIAHooks functions are called, but +// MTIA backend is not present. +#define FAIL_MTIAHOOKS_FUNC(func) \ + TORCH_CHECK(false, "Cannot execute ", func, "() without MTIA backend."); + + virtual ~MTIAHooksInterface() override = default; virtual void initMTIA() const { - TORCH_CHECK( - false, - "Cannot initialize MTIA without MTIA Extension for PyTorch.", - MTIA_HELP); + // Avoid logging here, since MTIA needs init devices first then it will know + // how many devices are available. Make it as no-op if mtia extension is not + // dynamically loaded. 
+ return; } virtual bool hasMTIA() const { return false; } + virtual DeviceIndex deviceCount() const override { + return 0; + } + + virtual void deviceSynchronize(c10::DeviceIndex device_index) const { + FAIL_MTIAHOOKS_FUNC(__func__); + } + virtual std::string showConfig() const { - TORCH_CHECK( - false, - "Cannot query detailed MTIA version without MTIA Extension for PyTorch.", - MTIA_HELP); + FAIL_MTIAHOOKS_FUNC(__func__); + } + + virtual bool hasPrimaryContext(DeviceIndex device_index) const override { + return false; + } + + virtual void setCurrentDevice(DeviceIndex device) const override { + FAIL_MTIAHOOKS_FUNC(__func__); + } + + virtual DeviceIndex getCurrentDevice() const override { + FAIL_MTIAHOOKS_FUNC(__func__); + return -1; + } + + virtual DeviceIndex exchangeDevice(DeviceIndex device) const override { + FAIL_MTIAHOOKS_FUNC(__func__); + return -1; + } + + virtual DeviceIndex maybeExchangeDevice(DeviceIndex device) const override { + FAIL_MTIAHOOKS_FUNC(__func__); + return -1; + } + + virtual c10::Stream getCurrentStream(DeviceIndex device) const { + FAIL_MTIAHOOKS_FUNC(__func__); + return c10::Stream::unpack3(-1, 0, c10::DeviceType::MTIA); + } + + virtual c10::Stream getDefaultStream(DeviceIndex device) const { + FAIL_MTIAHOOKS_FUNC(__func__); + return c10::Stream::unpack3(-1, 0, c10::DeviceType::MTIA); + } + + virtual void setCurrentStream(const c10::Stream& stream) const { + FAIL_MTIAHOOKS_FUNC(__func__); } }; @@ -47,5 +98,6 @@ C10_DECLARE_REGISTRY(MTIAHooksRegistry, MTIAHooksInterface, MTIAHooksArgs); namespace detail { TORCH_API const MTIAHooksInterface& getMTIAHooks(); +TORCH_API bool isMTIAHooksBuilt(); } // namespace detail } // namespace at diff --git a/aten/src/ATen/detail/ORTHooksInterface.cpp b/aten/src/ATen/detail/ORTHooksInterface.cpp deleted file mode 100644 index bbb69809e8770..0000000000000 --- a/aten/src/ATen/detail/ORTHooksInterface.cpp +++ /dev/null @@ -1,29 +0,0 @@ -#include - -#include -#include - -#include -#include - -namespace at { -namespace detail { - -// See getCUDAHooks for some more commentary -const ORTHooksInterface& getORTHooks() { - static std::unique_ptr ort_hooks; - static c10::once_flag once; - c10::call_once(once, [] { - ort_hooks = ORTHooksRegistry()->Create("ORTHooks", {}); - if (!ort_hooks) { - ort_hooks = std::make_unique(); - } - }); - return *ort_hooks; -} -} // namespace detail - -// NOLINTNEXTLINE(cppcoreguidelines-avoid-non-const-global-variables) -C10_DEFINE_REGISTRY(ORTHooksRegistry, ORTHooksInterface, ORTHooksArgs) - -} // namespace at diff --git a/aten/src/ATen/detail/ORTHooksInterface.h b/aten/src/ATen/detail/ORTHooksInterface.h deleted file mode 100644 index f49969ec66a5b..0000000000000 --- a/aten/src/ATen/detail/ORTHooksInterface.h +++ /dev/null @@ -1,36 +0,0 @@ -#pragma once - -#include -#include - -constexpr const char* ORT_HELP = - " You need to 'import torch_ort' to use the 'ort' device in PyTorch. " - "The 'torch_ort' module is provided by the ONNX Runtime itself " - "(https://onnxruntime.ai)."; - -// NB: Class must live in `at` due to limitations of Registry.h. -namespace at { - -struct TORCH_API ORTHooksInterface { - // This should never actually be implemented, but it is used to - // squelch -Werror=non-virtual-dtor - virtual ~ORTHooksInterface() = default; - - virtual std::string showConfig() const { - TORCH_CHECK(false, "Cannot query detailed ORT version information.", ORT_HELP); - } -}; - -// NB: dummy argument to suppress "ISO C++11 requires at least one argument -// for the "..." 
in a variadic macro" -struct TORCH_API ORTHooksArgs {}; - -TORCH_DECLARE_REGISTRY(ORTHooksRegistry, ORTHooksInterface, ORTHooksArgs); -#define REGISTER_ORT_HOOKS(clsname) \ - C10_REGISTER_CLASS(ORTHooksRegistry, clsname, clsname) - -namespace detail { -TORCH_API const ORTHooksInterface& getORTHooks(); -} // namespace detail - -} // namespace at diff --git a/aten/src/ATen/detail/PrivateUse1HooksInterface.cpp b/aten/src/ATen/detail/PrivateUse1HooksInterface.cpp index 8c3861c617ccc..ff267a41506bb 100644 --- a/aten/src/ATen/detail/PrivateUse1HooksInterface.cpp +++ b/aten/src/ATen/detail/PrivateUse1HooksInterface.cpp @@ -22,4 +22,15 @@ TORCH_API bool isPrivateUse1HooksRegistered() { return privateuse1_hooks != nullptr; } +namespace detail { + +TORCH_API const at::PrivateUse1HooksInterface& getPrivateUse1Hooks() { + TORCH_CHECK( + privateuse1_hooks != nullptr, + "Please register PrivateUse1HooksInterface by `RegisterPrivateUse1HooksInterface` first."); + return *privateuse1_hooks; } + +} // namespace detail + +} // namespace at diff --git a/aten/src/ATen/detail/PrivateUse1HooksInterface.h b/aten/src/ATen/detail/PrivateUse1HooksInterface.h index 142e812d28375..0b1b028ab4021 100644 --- a/aten/src/ATen/detail/PrivateUse1HooksInterface.h +++ b/aten/src/ATen/detail/PrivateUse1HooksInterface.h @@ -1,13 +1,17 @@ #pragma once #include +#include +#include #include +#include #include namespace at { -struct TORCH_API PrivateUse1HooksInterface { - virtual ~PrivateUse1HooksInterface() = default; - virtual const at::Generator& getDefaultGenerator(c10::DeviceIndex device_index) { +struct TORCH_API PrivateUse1HooksInterface : AcceleratorHooksInterface { + virtual ~PrivateUse1HooksInterface() override = default; + virtual const at::Generator& getDefaultGenerator( + c10::DeviceIndex device_index) { TORCH_CHECK_NOT_IMPLEMENTED( false, "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getDefaultGenerator`."); @@ -19,15 +23,39 @@ struct TORCH_API PrivateUse1HooksInterface { "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getDeviceFromPtr`."); } + virtual Allocator* getPinnedMemoryAllocator() const { + TORCH_CHECK( + false, + "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `getPinnedMemoryAllocator`."); + } + + virtual bool hasPrimaryContext(DeviceIndex device_index) const override { + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `hasPrimaryContext`."); + } + virtual void initPrivateUse1() const {} + virtual void resizePrivateUse1Bytes(const c10::Storage &storage, size_t newsize) const { + TORCH_CHECK_NOT_IMPLEMENTED( + false, + "You should register `PrivateUse1HooksInterface` for PrivateUse1 before call `resizePrivateUse1Bytes`."); + } }; struct TORCH_API PrivateUse1HooksArgs {}; -TORCH_API void RegisterPrivateUse1HooksInterface(at::PrivateUse1HooksInterface* hook_); +TORCH_API void RegisterPrivateUse1HooksInterface( + at::PrivateUse1HooksInterface* hook_); TORCH_API at::PrivateUse1HooksInterface* GetPrivateUse1HooksInterface(); TORCH_API bool isPrivateUse1HooksRegistered(); -} +namespace detail { + +TORCH_API const at::PrivateUse1HooksInterface& getPrivateUse1Hooks(); + +} // namespace detail + +} // namespace at diff --git a/aten/src/ATen/detail/XPUHooksInterface.h b/aten/src/ATen/detail/XPUHooksInterface.h index 9a3836dbcc78e..8e5e0d8243ab7 100644 --- a/aten/src/ATen/detail/XPUHooksInterface.h +++ b/aten/src/ATen/detail/XPUHooksInterface.h @@ -9,11 +9,6 @@ 
#include #include -// We use forward declaration here instead of #include to avoid -// leaking DLPack implementation detail to every project that includes `ATen/Context.h`, which in turn -// would lead to a conflict when linked with another project using DLPack (for example TVM) -struct DLDevice_; - namespace at { constexpr const char* XPU_HELP = @@ -44,23 +39,8 @@ struct TORCH_API XPUHooksInterface { XPU_HELP); } - virtual Device getATenDeviceFromDLPackDevice( - const DLDevice_& dl_device, - void* data) const { - TORCH_CHECK( - false, - "Cannot get XPU device without Intel Extension for Pytorch. ", - XPU_HELP); - } - - virtual DLDevice_& getDLPackDeviceFromATenDevice( - DLDevice_& dl_device, - const Device& aten_device, - void* data) const { - TORCH_CHECK( - false, - "Cannot get XPU DL device without Intel Extension for Pytorch. ", - XPU_HELP); + virtual int32_t getGlobalIdxFromDevice(const Device& device) const { + TORCH_CHECK(false, "Cannot get XPU global device index without ATen_xpu library."); } virtual Generator getXPUGenerator(C10_UNUSED DeviceIndex device_index = -1) const { @@ -71,9 +51,29 @@ struct TORCH_API XPUHooksInterface { TORCH_CHECK(false, "Cannot get default XPU generator without Intel Extension for Pytorch. ", XPU_HELP); } - virtual int getNumGPUs() const { + virtual DeviceIndex getNumGPUs() const { return 0; } + + virtual DeviceIndex current_device() const { + TORCH_CHECK(false, "Cannot get current device on XPU without ATen_xpu library."); + } + + virtual Device getDeviceFromPtr(void* /*data*/) const { + TORCH_CHECK(false, "Cannot get device of pointer on XPU without ATen_xpu library."); + } + + virtual void deviceSynchronize(DeviceIndex /*device_index*/) const { + TORCH_CHECK(false, "Cannot synchronize XPU device without ATen_xpu library."); + } + + virtual Allocator* getPinnedMemoryAllocator() const { + TORCH_CHECK(false, "Cannot get XPU pinned memory allocator without ATen_xpu library."); + } + + virtual bool isPinnedPtr(const void* /*data*/) const { + return false; + } }; struct TORCH_API XPUHooksArgs {}; diff --git a/aten/src/ATen/dlpack.h b/aten/src/ATen/dlpack.h index eb33058807718..9601a2478ddde 100644 --- a/aten/src/ATen/dlpack.h +++ b/aten/src/ATen/dlpack.h @@ -94,10 +94,7 @@ typedef enum { /*! * \brief A Device for Tensor and operator. */ -// NB: This is the only difference from -// https://github.com/dmlc/dlpack/blob/v0.7/include/dlpack/dlpack.h Required to -// allow forward declaration of DLDevice. -typedef struct DLDevice_ { +typedef struct { /*! \brief The device type used in the device. */ DLDeviceType device_type; /*! @@ -198,12 +195,12 @@ typedef struct { /*! \brief The data type of the pointer*/ DLDataType dtype; /*! \brief The shape of the tensor */ - int64_t* shape; + const int64_t* shape; /*! * \brief strides of the tensor (in number of elements, not bytes) * can be NULL, indicating tensor is compact and row-majored. */ - int64_t* strides; + const int64_t* strides; /*! 
\brief The offset in bytes to the beginning pointer to data */ uint64_t byte_offset; } DLTensor; diff --git a/aten/src/ATen/functorch/ADInterpreters.cpp b/aten/src/ATen/functorch/ADInterpreters.cpp index e113f5b01ad73..2f0de0b159b6a 100644 --- a/aten/src/ATen/functorch/ADInterpreters.cpp +++ b/aten/src/ATen/functorch/ADInterpreters.cpp @@ -3,7 +3,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { constexpr size_t default_bitset_size = 64; @@ -73,7 +73,7 @@ static void autogradBasedTransformProcess( return materializeGradWrappers(tensor, current_level); }; auto num_args = op.schema().arguments().size(); - foreachTensorInplace(*stack, stack->size() - num_args, stack->size(), maybeTransformGradWrappers); + foreachTensorInplace(*stack, static_cast(stack->size() - num_args), static_cast(stack->size()), maybeTransformGradWrappers); setup_dispatch_key_tls(transform_type, {}); op.callBoxed(stack); @@ -133,7 +133,7 @@ static void autogradBasedTransformSendToNext( auto args_size = op.schema().arguments().size(); const auto ret_size = op.schema().returns().size(); // Step 1 - auto front = stack->size() - args_size; + auto front = static_cast(stack->size()) - args_size; for (const auto arg_idx : c10::irange(0, args_size)) { stack->push_back((*stack)[front + arg_idx]); } @@ -151,7 +151,7 @@ static void autogradBasedTransformSendToNext( // if the input is immutable, we find if it aliases anything, noting that // args are in reverse order on stack, so the last arg is at the top of the stack const auto relative_pos = idx - (stack->size() - args_size); - const auto aliased_out = findAliasedOutput(op.schema(), relative_pos); + const auto aliased_out = findAliasedOutput(op.schema(), static_cast(relative_pos)); if (aliased_out.has_value()) { outputs_aliasing_immutable.flip(*aliased_out); // each output aliases at most one input, so we can only hit this once } @@ -160,7 +160,7 @@ static void autogradBasedTransformSendToNext( } // Step 2 - foreachTensorInplace(*stack, stack->size() - args_size, stack->size(), unwrap); + foreachTensorInplace(*stack, static_cast(stack->size() - args_size), static_cast(stack->size()), unwrap); // See NOTE [grad and vjp interaction with no_grad] optional grad_guard; @@ -183,7 +183,7 @@ static void autogradBasedTransformSendToNext( op.callBoxed(stack); // Step 4 - foreachTensorInplaceWithFlag(*stack, stack->size() - ret_size, stack->size(), outputs_aliasing_immutable, wrap); + foreachTensorInplaceWithFlag(*stack, static_cast(stack->size() - ret_size), static_cast(stack->size()), outputs_aliasing_immutable, wrap); // Step 5 auto args_front = stack->size() - args_size - ret_size; @@ -200,7 +200,7 @@ static void autogradBasedTransformSendToNext( } // Step 6 - stack->erase(stack->end() - (args_size + ret_size), stack->end() - ret_size); + stack->erase(stack->end() - std::ptrdiff_t(args_size + ret_size), stack->end() - std::ptrdiff_t(ret_size)); } void GradInterpreterPtr::processImpl( @@ -239,4 +239,4 @@ void JvpInterpreterPtr::sendToNextInterpreterImpl( grad_special_case); } -}} // namespace at::functorch +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesActivation.cpp b/aten/src/ATen/functorch/BatchRulesActivation.cpp index b26ec74d84af7..87a7865b05054 100644 --- a/aten/src/ATen/functorch/BatchRulesActivation.cpp +++ b/aten/src/ATen/functorch/BatchRulesActivation.cpp @@ -10,7 +10,7 @@ // NB: most activation functions fit pointwise unary or binary rules. 
// These are only the ones that have special batch rules to help with organization -namespace at { namespace functorch { +namespace at::functorch { static std::tuple> glu_batch_rule(const Tensor& self, optional self_bdim, int64_t dim) { // repeated error message from glu because 0D -> 1D when batched @@ -53,4 +53,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { VMAP_SUPPORT(glu_backward, glu_backward_batch_rule); VMAP_SUPPORT(glu, glu_batch_rule); } -}} // namespace at::functorch +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp b/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp index 1dd417052cf10..44ca2802bf3a2 100644 --- a/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesBinaryOps.cpp @@ -11,7 +11,7 @@ #include -namespace at { namespace functorch { +namespace at::functorch { template std::tuple> _binary_pointwise_batch_rule( @@ -60,13 +60,9 @@ struct BinaryRandomPointwiseBatchRuleHelper> { auto cur_level = maybe_layer->layerId(); RandomnessType randomness = maybe_layer->randomness(); - Tensor tensor_value; - optional tensor_bdim; - std::tie(tensor_value, tensor_bdim) = unwrapTensorAtLevel(tensor, cur_level); + auto [tensor_value, tensor_bdim] = unwrapTensorAtLevel(tensor, cur_level); - Tensor other_value; - optional other_bdim; - std::tie(other_value, other_bdim) = unwrapTensorAtLevel(other, cur_level); + auto [other_value, other_bdim] = unwrapTensorAtLevel(other, cur_level); check_randomness(randomness, (tensor_bdim || other_bdim)); if (randomness == RandomnessType::Different && !tensor_bdim && !other_bdim) { @@ -520,4 +516,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { VMAP_SUPPORT2(fill_, Tensor, fill__Tensor_batch_rule); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesConvolution.cpp b/aten/src/ATen/functorch/BatchRulesConvolution.cpp index c25c4972da25d..ca4eda19a36fb 100644 --- a/aten/src/ATen/functorch/BatchRulesConvolution.cpp +++ b/aten/src/ATen/functorch/BatchRulesConvolution.cpp @@ -8,7 +8,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { // convolution_batch_rule translated from jax with modifications: // https://github.com/google/jax/blob/master/jax/_src/lax/lax.py#L3143 @@ -29,7 +29,7 @@ convolution_batch_rule(const Tensor& lhs, optional lhs_bdim, const Tens // If we have a batched bias or weight, we need to perform the computation separately. 
optional unbatched_bias; - bool separate_bias; + bool separate_bias = false; if ((rhs_bdim && bias && bias->defined()) || bias_bdim) { TORCH_INTERNAL_ASSERT(bias.has_value()); TORCH_INTERNAL_ASSERT(bias->defined()); @@ -245,7 +245,7 @@ convolution_backward_input_batch_rule( const Tensor& input, optional input_bdim, const Tensor& weight, optional weight_bdim, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, - c10::SymIntArrayRef output_padding, c10::SymInt groups) { + c10::SymIntArrayRef output_padding, const c10::SymInt& groups) { const std::array mask = {true, false, false}; if (grad_output_bdim && weight_bdim) { // regular: BNO, BOI -> N(BO), (BO)I -> N(BI) @@ -326,7 +326,7 @@ convolution_backward_weight_batch_rule( const Tensor& input, optional input_bdim, const Tensor& weight, optional weight_bdim, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, - c10::SymIntArrayRef output_padding, c10::SymInt groups) { + c10::SymIntArrayRef output_padding, const c10::SymInt& groups) { const std::array mask = {false, true, false}; if (grad_output_bdim && input_bdim) { // BNO, BNI -> N(BO), N(BI) -> (BO)I (regular) (BI)O (transposed) @@ -449,15 +449,9 @@ static std::tuple convolution_backward_plumbing( dilation, transposed, output_padding, groups, output_mask); } - Tensor grad_output; - optional grad_output_bdim; - std::tie(grad_output, grad_output_bdim) = unwrapTensorAtLevel(grad_output_, cur_level); - Tensor input; - optional input_bdim; - std::tie(input, input_bdim) = unwrapTensorAtLevel(input_, cur_level); - Tensor weight; - optional weight_bdim; - std::tie(weight, weight_bdim) = unwrapTensorAtLevel(weight_, cur_level); + auto [grad_output, grad_output_bdim] = unwrapTensorAtLevel(grad_output_, cur_level); + auto [input, input_bdim] = unwrapTensorAtLevel(input_, cur_level); + auto [weight, weight_bdim] = unwrapTensorAtLevel(weight_, cur_level); const auto grad_bias = compute_grad_bias(grad_output_, output_mask); output_mask[2] = false; @@ -542,4 +536,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { m.impl("convolution_backward", convolution_backward_plumbing); } -}} // namespace at;:functorch +} // namespace at;:functorch diff --git a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp index 1b179a505e9a9..3e064d6c39dc7 100644 --- a/aten/src/ATen/functorch/BatchRulesDecompositions.cpp +++ b/aten/src/ATen/functorch/BatchRulesDecompositions.cpp @@ -13,7 +13,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { #define OP_DECOMPOSE(op) m.impl(#op, static_cast(native::op)); #define OP_DECOMPOSE2(op, overload) m.impl(#op"."#overload, static_cast(native::op)); @@ -226,6 +226,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) { m.impl("reshape", native::reshape_symint); OP_DECOMPOSE(resolve_conj); OP_DECOMPOSE(resolve_neg); + OP_DECOMPOSE(rms_norm); OP_DECOMPOSE(row_stack); OP_DECOMPOSE(rrelu); OP_DECOMPOSE(rrelu_); @@ -383,4 +384,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatchedDecomposition, m) { OP_DECOMPOSE2(to, other); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesDynamic.cpp b/aten/src/ATen/functorch/BatchRulesDynamic.cpp index b31d68a5768dd..e001e17f5d931 100644 --- a/aten/src/ATen/functorch/BatchRulesDynamic.cpp +++ b/aten/src/ATen/functorch/BatchRulesDynamic.cpp @@ -15,7 +15,7 @@ // errors for them. 
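Many of the functorch hunks in this patch are mechanical modernizations rather than behavior changes: namespace at { namespace functorch { collapses into a C++17 nested namespace definition, std::tie into pre-declared locals becomes a structured binding, and c10::SymInt parameters move to const& to avoid a copy. A condensed illustration, using hypothetical toy types (demo::batching, SymSize, unwrap) rather than the ATen ones, looks like this:

#include <iostream>
#include <string>
#include <tuple>

namespace demo::batching {  // C++17 nested namespace, replacing namespace demo { namespace batching {

struct SymSize { long value; };  // toy stand-in for c10::SymInt

// Taking SymSize by const reference instead of by value, as the patch does for c10::SymInt.
std::tuple<std::string, long> unwrap(const std::string& name, const SymSize& size) {
  return {name + "_unwrapped", size.value};
}

}  // namespace demo::batching

int main() {
  demo::batching::SymSize s{4};
  // Old style: std::string name; long value; std::tie(name, value) = unwrap("tensor", s);
  auto [name, value] = demo::batching::unwrap("tensor", s);  // new style: structured binding
  std::cout << name << " " << value << "\n";  // tensor_unwrapped 4
  return 0;
}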
-namespace at { namespace functorch { +namespace at::functorch { namespace { void unsupportedDynamicOp(const c10::OperatorHandle& op, torch::jit::Stack* stack) { @@ -76,4 +76,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { m.impl("allclose", torch::CppFunction::makeFromBoxedFunction<&unsupportedAllclose>()); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesFactory.cpp b/aten/src/ATen/functorch/BatchRulesFactory.cpp index 09430ce5f2483..f317fee6af6c7 100644 --- a/aten/src/ATen/functorch/BatchRulesFactory.cpp +++ b/aten/src/ATen/functorch/BatchRulesFactory.cpp @@ -7,7 +7,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { template struct NewBlahBatchRuleHelperSymInt; @@ -243,4 +243,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { VMAP_SUPPORT(_new_zeros_with_same_feature_meta, _new_zeros_with_same_feature_meta_batch_rule); // Not sure how to add the ones with irregular args to the mix cleanly (i.e. randint takes an extra int parameter) } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesHelper.cpp b/aten/src/ATen/functorch/BatchRulesHelper.cpp index 89e2b4c5c772d..edac0ebde7914 100644 --- a/aten/src/ATen/functorch/BatchRulesHelper.cpp +++ b/aten/src/ATen/functorch/BatchRulesHelper.cpp @@ -7,7 +7,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { Tensor moveBatchDimToFront(const Tensor& tensor, optional maybe_batch_dim) { if (!maybe_batch_dim.has_value()) { @@ -118,11 +118,9 @@ Tensor reshape_dim_outof(int64_t src, int64_t size1, const Tensor& x) { // NOTE: 0 % 0 leads to FPE TORCH_INTERNAL_ASSERT(shape[src] % size1 == 0); } - int64_t size2; // split any size out of `0`-sized dim - if (shape[src] == 0) { - size2 = 0; - } else { + int64_t size2 = 0; + if (shape[src] != 0) { size2 = shape[src] / size1; } shape[src] = size1; @@ -130,7 +128,7 @@ Tensor reshape_dim_outof(int64_t src, int64_t size1, const Tensor& x) { return at::reshape(x, shape); } -Tensor reshape_dim_outof_symint(int64_t src, c10::SymInt size1, const Tensor& x) { +Tensor reshape_dim_outof_symint(int64_t src, const c10::SymInt& size1, const Tensor& x) { src = maybe_wrap_dim(src, x.dim()); c10::SymDimVector shape(x.sym_sizes().begin(), x.sym_sizes().end()); if (shape[src] != 0) { @@ -204,4 +202,4 @@ std::tuple _binary_pointwise_helper( return std::make_tuple(tensor_, other_); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesHelper.h b/aten/src/ATen/functorch/BatchRulesHelper.h index 74217d8464d0a..9bb31e09ce4c5 100644 --- a/aten/src/ATen/functorch/BatchRulesHelper.h +++ b/aten/src/ATen/functorch/BatchRulesHelper.h @@ -28,7 +28,7 @@ namespace at::functorch { TORCH_API Tensor reshape_dim_into(int64_t src, int64_t dst, const Tensor& x); TORCH_API Tensor reshape_dim_outof(int64_t src, int64_t size1, const Tensor& x); -TORCH_API Tensor reshape_dim_outof_symint(int64_t src, c10::SymInt size1, const Tensor& x); +TORCH_API Tensor reshape_dim_outof_symint(int64_t src, const c10::SymInt& size1, const Tensor& x); Tensor moveBatchDimToFront(const Tensor& tensor, optional maybe_batch_dim); int64_t rankWithoutBatchDim(const Tensor& tensor, optional maybe_batch_dim); @@ -144,11 +144,9 @@ void boxed_tensor_inputs_batch_rule(const c10::OperatorHandle& op, torch::jit::S for (const auto idx : c10::irange(0, num_arguments)) { const auto& ivalue = arguments[idx]; if (ivalue.isTensor()) { - Tensor tensor_value; - optional tensor_bdim; - std::tie(tensor_value, tensor_bdim) = 
unwrapTensorAtLevel(ivalue.toTensor(), cur_level); + auto [tensor_value, tensor_bdim] = unwrapTensorAtLevel(ivalue.toTensor(), cur_level); tensor_inputs.emplace_back(tensor_value, tensor_bdim); - tensor_pos.push_back(idx); + tensor_pos.push_back(static_cast(idx)); } } Func(tensor_inputs); @@ -214,7 +212,7 @@ inline void find_and_unpack_tensors( int64_t* batch_size) { int64_t computed_batch_size = -1; - int64_t args_begin = stack->size() - num_args; + int64_t args_begin = static_cast(stack->size()) - num_args; for (const auto idx : c10::irange(0, num_args)) { const auto& ivalue = (*stack)[args_begin + idx]; @@ -243,7 +241,7 @@ inline void boxed_existing_bdim_all_batch_rule( const c10::OperatorHandle& op, torch::jit::Stack* stack) { const auto& schema = op.schema(); const auto num_returns = schema.returns().size(); - const auto num_arguments = schema.arguments().size(); + const auto num_arguments = static_cast(schema.arguments().size()); c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); auto maybe_layer = maybeCurrentDynamicLayer(); @@ -256,10 +254,10 @@ inline void boxed_existing_bdim_all_batch_rule( return; } - int64_t args_begin = stack->size() - num_arguments; + int64_t args_begin = static_cast(stack->size()) - num_arguments; SmallVector tensor_inputs; SmallVector tensor_pos; - int64_t batch_size; + int64_t batch_size = 0; find_and_unpack_tensors( stack, num_arguments, cur_level, @@ -312,13 +310,13 @@ inline void boxed_all_tensors_have_optional_bdim( return; } - int64_t args_begin = stack->size() - num_arguments; + int64_t args_begin = static_cast(stack->size() - num_arguments); SmallVector tensor_inputs; SmallVector tensor_pos; - int64_t batch_size; + int64_t batch_size = 0; find_and_unpack_tensors( - stack, num_arguments, cur_level, + stack, static_cast(num_arguments), cur_level, &tensor_inputs, &tensor_pos, &batch_size); optional is_no_batch_dim_case; diff --git a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp index c3158214ba087..6a17adb4e268c 100644 --- a/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp +++ b/aten/src/ATen/functorch/BatchRulesLinearAlgebra.cpp @@ -6,7 +6,7 @@ #include -namespace at { namespace functorch { +namespace at::functorch { typedef std::tuple> oneOutput; typedef std::tuple, Tensor, optional> twoOutputs; @@ -265,6 +265,28 @@ static void expect_at_least_rank( rank, " dimensions instead."); } +threeOutputs linalg_lu_unpack_batch_rule( + const Tensor& LU, optional LU_bdim, + const Tensor& pivots, optional pivots_bdim, + bool unpack_data, bool unpack_pivots) { + auto LU_ = moveBatchDimToFront(LU, LU_bdim); + auto pivots_ = moveBatchDimToFront(pivots, pivots_bdim); + + // LU and pivots's first {N-2} (for LU), {N-1} (for pivots) dimensions must + // match So if only one of them is being vmapped over, we must expand out that + // dimension. 
+ if (LU_bdim.has_value() != pivots_bdim.has_value()) { + auto bdim_size = get_bdim_size2(LU, LU_bdim, pivots, pivots_bdim); + LU_ = ensure_has_bdim(LU_, LU_bdim.has_value(), bdim_size); + pivots_ = ensure_has_bdim(pivots_, pivots_bdim.has_value(), bdim_size); + pivots_bdim = 0; + LU_bdim = 0; + } + + const auto res = at::lu_unpack(LU_, pivots_, unpack_data, unpack_pivots); + return std::make_tuple(std::get<0>(res), 0, std::get<1>(res), 0, std::get<2>(res), 0); +} + oneOutput linalg_lu_solve_batch_rule( const Tensor& LU, optional LU_bdim, const Tensor& pivots, optional pivots_bdim, @@ -348,7 +370,7 @@ fourOutputs solve_ex_batch_rule( TORCH_CHECK(A_logical_rank >= 2, "linalg.solve: The input tensor A must have at least 2 dimensions."); - int b_logical_rank = max_logical_rank; + auto b_logical_rank = max_logical_rank; if (A_logical_rank > B_logical_rank) { // vector case: B was a vector or batched vector // not accurate but matches linalg error message TORCH_CHECK(B_logical_rank >= 1, "linalg.solve: The input tensor B must have at least 2 dimensions."); @@ -417,8 +439,7 @@ fourOutputs linalg_lstsq_batch_rule( const auto self_ = ensure_has_bdim(std::get<0>(tensor_other), self_bdim.has_value(), batch_size); const auto b_ = ensure_has_bdim(std::get<1>(tensor_other), b_bdim.has_value(), batch_size); - Tensor res, res_1, res_2, res_3; - std::tie(res, res_1, res_2, res_3) = at::linalg_lstsq(self_, b_, rcond, driver); + auto [res, res_1, res_2, res_3] = at::linalg_lstsq(self_, b_, rcond, driver); // everything but the 0th output are only sometimes computed. When they aren't, they're empty tensors without a bdim const auto res_1_bdim = batch_dim_if_not_empty(res_1); @@ -553,6 +574,7 @@ pinv_batch_rule( } // These need to be outside. String constant must be declared outside of a macro to be used as template param +// NOLINTBEGIN(*array*) LINALG_CHECK_MATRIX_UNARY_ONE_OUT(cholesky, cholesky); LINALG_CHECK_MATRIX_UNARY_ONE_OUT(cholesky_inverse, cholesky_inverse); LINALG_CHECK_MATRIX_UNARY_TWO_OUT(linalg_cholesky_ex, linalg.cholesky); @@ -569,6 +591,7 @@ LINALG_CHECK_MATRIX_UNARY_THREE_OUT(_linalg_det, linalg.det); LINALG_CHECK_MATRIX_UNARY_TWO_OUT(_linalg_eigh, linalg.eigh); LINALG_CHECK_MATRIX_UNARY_FOUR_OUT(_linalg_slogdet, linalg.slogdet); LINALG_CHECK_MATRIX_UNARY_THREE_OUT(_linalg_svd, linalg.svd); +// NOLINTEND(*array*) TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { VMAP_SUPPORT(bmm, bmm_batch_rule); @@ -579,6 +602,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { VMAP_SUPPORT(dot, dot_batch_rule); VMAP_SUPPORT(mv, mv_batch_rule); VMAP_SUPPORT(mm, mm_batch_rule); + VMAP_SUPPORT(lu_unpack, linalg_lu_unpack_batch_rule); VMAP_SUPPORT(linalg_lu_solve, linalg_lu_solve_batch_rule); VMAP_SUPPORT(linalg_householder_product, householder_product_batch_rule); VMAP_SUPPORT(cholesky_solve, cholesky_solve_batch_rule); // custom dim error @@ -593,4 +617,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { m.impl("vdot", vdot_decomp); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesLoss.cpp b/aten/src/ATen/functorch/BatchRulesLoss.cpp index 470f18305f033..22f3adff95a01 100644 --- a/aten/src/ATen/functorch/BatchRulesLoss.cpp +++ b/aten/src/ATen/functorch/BatchRulesLoss.cpp @@ -9,7 +9,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { // Flattens out all dims except the batch dim, and also moves batch dim // (if it exists) to front. 
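The new lu_unpack batch rule above follows the standard vmap recipe used throughout these files: move any batch dimension to the front, and if only one operand is batched, expand the other so both share the same leading batch size before calling the underlying op once. The following rough, standalone sketch (linking against libtorch, using the public at::linalg_lu_factor and at::lu_unpack calls) shows the same expansion idea on plain tensors; it is not the functorch machinery itself, which works on (Tensor, optional bdim) pairs via moveBatchDimToFront/ensure_has_bdim.

#include <ATen/ATen.h>
#include <cstdint>
#include <iostream>

int main() {
  const int64_t batch = 4;
  // Batched 3x3 matrices: LU factors come out as [B, 3, 3], pivots as [B, 3].
  auto A = at::rand({batch, 3, 3});
  auto lu = at::linalg_lu_factor(A);
  auto LU = std::get<0>(lu);          // [4, 3, 3], batch dim already in front
  auto pivots = std::get<1>(lu)[0];   // pretend only LU is being vmapped: unbatched [3]

  // ensure_has_bdim-style expansion for the unbatched operand so both share
  // the same leading batch size before the single call to the base op.
  auto pivots_b = pivots.unsqueeze(0).expand({batch, pivots.size(0)});

  auto unpacked = at::lu_unpack(LU, pivots_b);          // (P, L, U)
  std::cout << std::get<1>(unpacked).sizes() << "\n";   // L: [4, 3, 3]
  return 0;
}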
static at::Tensor flatten_logical(const Tensor& tensor, optional bdim) { @@ -98,12 +98,8 @@ static Tensor binary_cross_entropy_plumbing( return at::binary_cross_entropy(self, target, weight, reduction); } - Tensor self_value; - optional self_bdim; - std::tie(self_value, self_bdim) = unwrapTensorAtLevel(self, cur_level); - Tensor target_value; - optional target_bdim; - std::tie(target_value, target_bdim) = unwrapTensorAtLevel(target, cur_level); + auto [self_value, self_bdim] = unwrapTensorAtLevel(self, cur_level); + auto [target_value, target_bdim] = unwrapTensorAtLevel(target, cur_level); Tensor result; if (self_bdim || target_bdim) { @@ -137,16 +133,10 @@ static Tensor binary_cross_entropy_backward_plumbing( return at::binary_cross_entropy_backward(grad, input, target, weight_opt, reduction); } - Tensor grad_value; - optional grad_bdim; - std::tie(grad_value, grad_bdim) = unwrapTensorAtLevel( + auto [grad_value, grad_bdim] = unwrapTensorAtLevel( reduction == Reduction::None ? grad : grad.expand_as(input), cur_level); - Tensor input_value; - optional input_bdim; - std::tie(input_value, input_bdim) = unwrapTensorAtLevel(input, cur_level); - Tensor target_value; - optional target_bdim; - std::tie(target_value, target_bdim) = unwrapTensorAtLevel(target, cur_level); + auto [input_value, input_bdim] = unwrapTensorAtLevel(input, cur_level); + auto [target_value, target_bdim] = unwrapTensorAtLevel(target, cur_level); Tensor grad_input; if (grad_bdim || input_bdim || target_bdim) { @@ -190,4 +180,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { m.impl("binary_cross_entropy_backward", binary_cross_entropy_backward_plumbing); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesModules.cpp b/aten/src/ATen/functorch/BatchRulesModules.cpp index 987c2edaabff1..875af39214453 100644 --- a/aten/src/ATen/functorch/BatchRulesModules.cpp +++ b/aten/src/ATen/functorch/BatchRulesModules.cpp @@ -10,7 +10,7 @@ #include -namespace at { namespace functorch { +namespace at::functorch { static Tensor getStepTensor(const Tensor& indices, const c10::SymInt& bdim_size, const c10::SymInt& num_embeddings) { // [batch_size, 1, 1, 1, ..., 1] @@ -218,16 +218,16 @@ cudnn_grid_sample_backward_batch_rule( // TODO: replace with targetable functionalization static Tensor one_hot_decomposition_hack(const Tensor &self, int64_t num_classes) { TORCH_CHECK(self.dtype() == kLong, "one_hot is only applicable to index tensor."); - auto shape = self.sizes().vec(); + auto shape = self.sym_sizes().vec(); // empty tensor could be converted to one hot representation, // but shape inference is not possible. 
- if (self.numel() == 0) { + if (self.sym_numel() == 0) { if (num_classes <= 0) { AT_ERROR("Can not infer total number of classes from empty tensor."); } else { shape.push_back(num_classes); - return at::empty(shape, self.options()); + return at::empty_symint(shape, self.options()); } } @@ -247,7 +247,7 @@ static Tensor one_hot_decomposition_hack(const Tensor &self, int64_t num_classes // } shape.push_back(num_classes); - Tensor ret = at::zeros(shape, self.options()); + Tensor ret = at::zeros_symint(shape, self.options()); return ret.scatter(-1, self.unsqueeze(-1), 1); } @@ -402,4 +402,5 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { m.impl("one_hot", one_hot_decomposition_hack); } -}} + +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesNorm.cpp b/aten/src/ATen/functorch/BatchRulesNorm.cpp index 42f68b731af45..faf39d8e374a3 100644 --- a/aten/src/ATen/functorch/BatchRulesNorm.cpp +++ b/aten/src/ATen/functorch/BatchRulesNorm.cpp @@ -9,7 +9,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { static bool is_empty_tensor(const Tensor& tensor) { const auto shape = tensor.sizes(); @@ -225,12 +225,8 @@ std::tuple batch_norm_backward_plumbing( vmap_check_escaped(maybe_layer, "batch_norm_backward_plumbing"); int64_t cur_level = maybe_layer->layerId(); - Tensor grad_out_value; - optional grad_out_bdim; - std::tie(grad_out_value, grad_out_bdim) = unwrapTensorAtLevel(grad_out, cur_level); - Tensor input_value; - optional input_bdim; - std::tie(input_value, input_bdim) = unwrapTensorAtLevel(input, cur_level); + auto [grad_out_value, grad_out_bdim] = unwrapTensorAtLevel(grad_out, cur_level); + auto [input_value, input_bdim] = unwrapTensorAtLevel(input, cur_level); Tensor mean_value; optional weight_value; optional weight_bdim; @@ -247,12 +243,8 @@ std::tuple batch_norm_backward_plumbing( if (running_var.defined()) { std::tie(running_var_value, running_var_bdim) = unwrapTensorAtLevel(running_var, cur_level); } - Tensor save_mean_value; - optional save_mean_bdim; - std::tie(save_mean_value, save_mean_bdim) = unwrapTensorAtLevel(save_mean, cur_level); - Tensor save_rstd_value; - optional save_rstd_bdim; - std::tie(save_rstd_value, save_rstd_bdim) = unwrapTensorAtLevel(save_rstd, cur_level); + auto [save_mean_value, save_mean_bdim] = unwrapTensorAtLevel(save_mean, cur_level); + auto [save_rstd_value, save_rstd_bdim] = unwrapTensorAtLevel(save_rstd, cur_level); // results Tensor grad_bias; @@ -274,9 +266,7 @@ std::tuple batch_norm_backward_plumbing( if (output_mask[0]) { const auto grad_normalized_input = weight.defined() ? 
grad_out.transpose(0, 1) * padRight(weight, nullopt, grad_out.dim()) : grad_out.transpose(0, 1); // [B0, C, B, *] - Tensor grad_normalized_input_value; - optional grad_normalized_input_bdim; - std::tie(grad_normalized_input_value, grad_normalized_input_bdim) = + auto [grad_normalized_input_value, grad_normalized_input_bdim] = unwrapTensorAtLevel(grad_normalized_input.transpose(0, 1), cur_level); // [B0, B, C, *] c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); @@ -312,9 +302,7 @@ static std::tuple native_group_norm_plumbing( return at::native_group_norm(input, weight_opt, bias_opt, N, C, HxW, group, eps); } - Tensor input_value; - optional input_bdim; - std::tie(input_value, input_bdim) = unwrapTensorAtLevel(input, cur_level); + auto [input_value, input_bdim] = unwrapTensorAtLevel(input, cur_level); Tensor result0; Tensor mean; @@ -401,20 +389,14 @@ static std::tuple native_group_norm_backward_plumbing( return at::native_group_norm_backward(grad_out, input, mean, rstd, weight_opt, N, C, HxW, group, output_mask); } - Tensor input_value; - optional input_bdim; - std::tie(input_value, input_bdim) = unwrapTensorAtLevel(input, cur_level); + auto [input_value, input_bdim] = unwrapTensorAtLevel(input, cur_level); Tensor weight_value; optional weight_bdim; if (weight.defined()){ std::tie(weight_value, weight_bdim) = unwrapTensorAtLevel(weight, cur_level); } - Tensor mean_value; - optional mean_bdim; - std::tie(mean_value, mean_bdim) = unwrapTensorAtLevel(mean, cur_level); - Tensor rstd_value; - optional rstd_bdim; - std::tie(rstd_value, rstd_bdim) = unwrapTensorAtLevel(rstd, cur_level); + auto [mean_value, mean_bdim] = unwrapTensorAtLevel(mean, cur_level); + auto [rstd_value, rstd_bdim] = unwrapTensorAtLevel(rstd, cur_level); // results Tensor grad_input; @@ -436,9 +418,7 @@ static std::tuple native_group_norm_backward_plumbing( if (output_mask[0]) { const auto grad_normalized_input = weight.defined() ? 
grad_out * padRight(weight, nullopt, grad_out.dim() - 1) : grad_out; - Tensor grad_normalized_input_value; - optional grad_normalized_input_bdim; - std::tie(grad_normalized_input_value, grad_normalized_input_bdim) = + auto [grad_normalized_input_value, grad_normalized_input_bdim] = unwrapTensorAtLevel(grad_normalized_input, cur_level); c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); @@ -494,7 +474,7 @@ C10_ALWAYS_INLINE void _check_layer_norm_inputs( const Tensor& weight, optional weight_bdim, const Tensor& bias, optional bias_bdim) { - const int normalized_ndim = normalized_shape.size(); + const auto normalized_ndim = normalized_shape.size(); TORCH_CHECK( normalized_ndim >= 1, "Expected normalized_shape to be at least 1-dimensional, i.e., ", @@ -611,18 +591,10 @@ static std::tuple native_layer_norm_backward_p return at::native_layer_norm_backward(grad_out, input, normalized_shape, mean, rstd, weight_opt, bias_opt, output_mask); } - Tensor grad_out_value; - optional grad_out_bdim; - std::tie(grad_out_value, grad_out_bdim) = unwrapTensorAtLevel(grad_out, cur_level); - Tensor input_value; - optional input_bdim; - std::tie(input_value, input_bdim) = unwrapTensorAtLevel(input, cur_level); - Tensor mean_value; - optional mean_bdim; - std::tie(mean_value, mean_bdim) = unwrapTensorAtLevel(mean, cur_level); - Tensor rstd_value; - optional rstd_bdim; - std::tie(rstd_value, rstd_bdim) = unwrapTensorAtLevel(rstd, cur_level); + auto [grad_out_value, grad_out_bdim] = unwrapTensorAtLevel(grad_out, cur_level); + auto [input_value, input_bdim] = unwrapTensorAtLevel(input, cur_level); + auto [mean_value, mean_bdim] = unwrapTensorAtLevel(mean, cur_level); + auto [rstd_value, rstd_bdim] = unwrapTensorAtLevel(rstd, cur_level); optional weight_value; optional weight_bdim; if (weight.defined()) { @@ -644,7 +616,7 @@ static std::tuple native_layer_norm_backward_p if (num_front_dims_to_reduce == 0) { grad_bias = grad_out; } else { - grad_bias = grad_out.sum(range(0, num_front_dims_to_reduce)); + grad_bias = grad_out.sum(range(0, static_cast(num_front_dims_to_reduce))); } } if (output_mask[1] && weight_value.has_value()) { @@ -656,15 +628,13 @@ static std::tuple native_layer_norm_backward_p if (num_front_dims_to_reduce == 0) { grad_weight = expanded_grad_weight; } else { - grad_weight = expanded_grad_weight.sum(range(0, num_front_dims_to_reduce)); + grad_weight = expanded_grad_weight.sum(range(0, static_cast(num_front_dims_to_reduce))); } } if (output_mask[0]) { const auto grad_normalized_input = weight.defined() ? 
grad_out * weight : grad_out; - Tensor grad_normalized_input_value; - optional grad_normalized_input_bdim; - std::tie(grad_normalized_input_value, grad_normalized_input_bdim) = + auto [grad_normalized_input_value, grad_normalized_input_bdim] = unwrapTensorAtLevel(grad_normalized_input, cur_level); c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); @@ -906,4 +876,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { m.impl("native_layer_norm_backward", native_layer_norm_backward_plumbing); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesPooling.cpp b/aten/src/ATen/functorch/BatchRulesPooling.cpp index b6ebb2e788089..68c25e6053d65 100644 --- a/aten/src/ATen/functorch/BatchRulesPooling.cpp +++ b/aten/src/ATen/functorch/BatchRulesPooling.cpp @@ -9,7 +9,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { template std::tuple,Tensor,optional> @@ -72,4 +72,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { ALL_TENSORS_HAVE_OPTIONAL_BDIM_BOXED_CONTIG1(4, max_pool3d_with_indices_backward, 2); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesRandomness.cpp b/aten/src/ATen/functorch/BatchRulesRandomness.cpp index 47cff54575cfd..79572f22ea3f6 100644 --- a/aten/src/ATen/functorch/BatchRulesRandomness.cpp +++ b/aten/src/ATen/functorch/BatchRulesRandomness.cpp @@ -16,8 +16,7 @@ // registered to FuncTorchVmapMode. This is because we need to interpose on // random operations even if they're not on a BatchedTensor. -namespace at { -namespace functorch { +namespace at::functorch { template Tensor random_batching_rule(SymIntArrayRef shape, ExtraArgs... extra_args) { @@ -40,9 +39,7 @@ Tensor& random_inplace_batching_rule(Tensor& self, ExtraArgs... extra_args) { c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchVmapMode); auto maybe_layer = maybeCurrentDynamicLayer(); const auto cur_level = maybe_layer->layerId(); - Tensor self_value; - optional self_bdim; - std::tie(self_value, self_bdim) = unwrapTensorAtLevel(self, cur_level); + auto [self_value, self_bdim] = unwrapTensorAtLevel(self, cur_level); self_value = moveBatchDimToFront(self_value, self_bdim); RandomnessType randomness = maybe_layer->randomness(); check_randomness(randomness); @@ -67,13 +64,9 @@ static Tensor& bernoulli_inplace_Tensor_batching_rule(Tensor& self, const Tensor auto cur_level = maybe_layer->layerId(); RandomnessType randomness = maybe_layer->randomness(); - Tensor self_value; - optional self_bdim; - std::tie(self_value, self_bdim) = unwrapTensorAtLevel(self, cur_level); + auto [self_value, self_bdim] = unwrapTensorAtLevel(self, cur_level); - Tensor other_value; - optional other_bdim; - std::tie(other_value, other_bdim) = unwrapTensorAtLevel(p_, cur_level); + auto [other_value, other_bdim] = unwrapTensorAtLevel(p_, cur_level); check_randomness(randomness, other_bdim.has_value()); @@ -135,9 +128,7 @@ Tensor unary_pointwise_random_batch_rule(const Tensor& tensor, ExtraArgs... extr auto maybe_layer = maybeCurrentDynamicLayer(); const auto cur_level = maybe_layer->layerId(); - Tensor tensor_value; - optional tensor_bdim; - std::tie(tensor_value, tensor_bdim) = unwrapTensorAtLevel(tensor, cur_level); + auto [tensor_value, tensor_bdim] = unwrapTensorAtLevel(tensor, cur_level); tensor_value = moveBatchDimToFront(tensor_value, tensor_bdim); RandomnessType randomness = maybe_layer->randomness(); @@ -165,9 +156,7 @@ Tensor tensor_like_random_batch_rule(const Tensor& self, ExtraArgs... 
extra_args RandomnessType randomness = maybe_layer->randomness(); check_randomness(randomness); - Tensor tensor_value; - optional tensor_bdim; - std::tie(tensor_value, tensor_bdim) = unwrapTensorAtLevel(self, cur_level); + auto [tensor_value, tensor_bdim] = unwrapTensorAtLevel(self, cur_level); tensor_value = moveBatchDimToFront(tensor_value, tensor_bdim); if (randomness == RandomnessType::Same && tensor_bdim) { @@ -190,9 +179,7 @@ static std::tuple native_dropout_batching_rule(const Tensor& tens const auto cur_level = maybe_layer->layerId(); RandomnessType randomness = maybe_layer->randomness(); - Tensor tensor_value; - optional tensor_bdim; - std::tie(tensor_value, tensor_bdim) = unwrapTensorAtLevel(tensor, cur_level); + auto [tensor_value, tensor_bdim] = unwrapTensorAtLevel(tensor, cur_level); tensor_value = moveBatchDimToFront(tensor_value, tensor_bdim); if (!train.has_value() || train) { @@ -212,8 +199,8 @@ static std::tuple native_dropout_batching_rule(const Tensor& tens } auto [output, mask] = at::native_dropout(tensor_value, p, train); return std::make_tuple( - makeBatched(std::move(output), 0, cur_level), - makeBatched(std::move(mask), 0, cur_level)); + makeBatched(output, 0, cur_level), + makeBatched(mask, 0, cur_level)); } // repeated code from the CPU kernel since the CUDA one doesn't call bernoulli_ explicitly @@ -231,9 +218,7 @@ static Tensor multinomial_batching_rule(const Tensor& self, const int64_t num_sa auto maybe_layer = maybeCurrentDynamicLayer(); const auto cur_level = maybe_layer->layerId(); - Tensor self_value; - optional self_bdim; - std::tie(self_value, self_bdim) = unwrapTensorAtLevel(self, cur_level); + auto [self_value, self_bdim] = unwrapTensorAtLevel(self, cur_level); self_value = moveBatchDimToFront(self_value, self_bdim); RandomnessType randomness = maybe_layer->randomness(); @@ -279,7 +264,7 @@ struct RandomBatchRuleHelper> { template Tensor rand_int_wrapper(SymIntArrayRef shape, c10::SymInt high, T... extra_args) { - return Func(high, std::move(shape), std::forward(extra_args)...); + return Func(high, shape, std::forward(extra_args)...); } template @@ -505,4 +490,5 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchVmapMode, m) { #undef UNARY_POINTWISE_RANDOM_LEADING_FLOAT #undef TENSOR_LIKE_COMMON_ARG_TYPES } -}} // namespace at::functorch + +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp index 62afe2699f395..cb6d6ac519dd8 100644 --- a/aten/src/ATen/functorch/BatchRulesReduceOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesReduceOps.cpp @@ -11,7 +11,7 @@ #include -namespace at { namespace functorch { +namespace at::functorch { static bool is_allowed_dim_on_scalar_tensor(int64_t dim) { return dim == 0 || dim == -1; @@ -75,7 +75,7 @@ static Tensor any_decomp(const Tensor& self) { return at::any(self.flatten(), 0, false); } -enum ReductionCase { DimArray, Dim }; +enum class ReductionCase:uint8_t { DimArray, Dim }; // Macros and templates have a difficult time dealing with enums, // so we didn't turn this into an enum. 
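The reduce-ops hunk above turns the unscoped `enum ReductionCase` into a scoped `enum class ReductionCase : uint8_t` and value-initializes the local with `ReductionCase reduction_case{};`. A minimal standalone sketch of that pattern (the names mirror the patch but nothing else from functorch is assumed):

#include <cstdint>
#include <cassert>

// Scoped enumeration with a fixed 1-byte underlying type: enumerators no
// longer leak into the enclosing scope and do not convert to int implicitly.
enum class ReductionCase : std::uint8_t { DimArray, Dim };

int main() {
  ReductionCase reduction_case{};          // value-initialized to DimArray (0)
  assert(reduction_case == ReductionCase::DimArray);
  reduction_case = ReductionCase::Dim;     // must be qualified with the enum name
  static_assert(sizeof(ReductionCase) == 1, "fixed underlying type");
  return 0;
}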
@@ -115,7 +115,7 @@ void boxed_reduction_batch_rule(const c10::OperatorHandle& op, torch::jit::Stack auto orig_arguments = torch::jit::last(*stack, num_arguments); if (std::none_of(orig_arguments.begin(), orig_arguments.end(), ivalueParticipatesInCurrentLevel)) { - c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); + c10::impl::ExcludeDispatchKeyGuard guard_2(DispatchKey::FuncTorchBatched); op.callBoxed(stack); return; } @@ -123,15 +123,13 @@ void boxed_reduction_batch_rule(const c10::OperatorHandle& op, torch::jit::Stack auto arguments = torch::jit::pop(*stack, num_arguments); TORCH_INTERNAL_ASSERT(arguments[0].isTensor()); - Tensor self; - optional self_bdim; - std::tie(self, self_bdim) = unwrapTensorAtLevel(arguments[0].toTensor(), cur_level); + auto [self, self_bdim] = unwrapTensorAtLevel(arguments[0].toTensor(), cur_level); self = moveBatchDimToFront(self, self_bdim); auto logical_dim = rankWithoutBatchDim(self, self_bdim); std::vector dims; - ReductionCase reduction_case; + ReductionCase reduction_case{}; if (arguments[dim_arg_pos].isIntList()) { reduction_case = ReductionCase::DimArray; dims = arguments[dim_arg_pos].toIntList().vec(); @@ -509,4 +507,5 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { VMAP_SUPPORT(_is_all_true, _is_all_true_batch_rule); VMAP_SUPPORT(_is_any_true, _is_any_true_batch_rule); } -}} + +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp index d253fa0047db6..0a1475497b03d 100644 --- a/aten/src/ATen/functorch/BatchRulesScatterOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesScatterOps.cpp @@ -11,11 +11,10 @@ #include #include #include -#include #include -namespace at { namespace functorch { +namespace at::functorch { namespace { static bool any_has_value(ArrayRef> bdims) { @@ -326,9 +325,7 @@ Tensor index_plumbing(const Tensor & self, const List> & indice if (!isBatchedAtLevel(self, cur_level) && !isBatchedAtLevel(indices, cur_level)) { return at::index(self, indices); } - Tensor self_value; - optional self_bdim; - std::tie(self_value, self_bdim) = unwrapTensorAtLevel(self, cur_level); + auto [self_value, self_bdim] = unwrapTensorAtLevel(self, cur_level); std::vector> indices_value; std::vector> indices_bdims; for (const auto&& indRef : indices) { @@ -458,9 +455,7 @@ namespace { const List> &indices, const Tensor &values, int64_t cur_level) { - Tensor self_value; - optional self_bdim; - std::tie(self_value, self_bdim) = unwrapTensorAtLevel(self, cur_level); + auto [self_value, self_bdim] = unwrapTensorAtLevel(self, cur_level); std::vector> indices_value; std::vector> indices_bdims; for (const auto &&indRef : indices) @@ -468,16 +463,13 @@ namespace { optional ind = indRef; optional index; optional index_bdim; - if (ind.has_value()) - { + if (ind.has_value()) { std::tie(index, index_bdim) = unwrapTensorAtLevel(ind.value(), cur_level); } indices_value.push_back(index); indices_bdims.push_back(index_bdim); } - Tensor values_value; - optional values_bdim; - std::tie(values_value, values_bdim) = unwrapTensorAtLevel(values, cur_level); + auto [values_value, values_bdim] = unwrapTensorAtLevel(values, cur_level); return std::make_tuple(self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim); } @@ -494,9 +486,7 @@ void index_put__batch_rule( if (!self_bdim.has_value()) { vmapIncompatibleInplaceError("index_put_"); } - Tensor self_, values_; - std::vector> indices_; - std::tie(self_, indices_, values_) = index_put_batch_rule_helper( 
+ auto [self_, indices_, values_] = index_put_batch_rule_helper( self, self_bdim, indices, indices_bdims, values, values_bdim); at::index_put_(self_, List>(indices_), values_, accumulate); } @@ -511,11 +501,7 @@ Tensor& index_put__plumbing(Tensor & self, const List> & indice if (!isBatchedAtLevel(self, cur_level) && !isBatchedAtLevel(indices, cur_level) && !isBatchedAtLevel(values, cur_level)) { return self.index_put_(indices, values, accumulate); } - Tensor self_value, values_value; - optional self_bdim, values_bdim; - std::vector> indices_value; - std::vector> indices_bdims; - std::tie(self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim) = + auto [self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim] = unpackSelfAndIndicesAndValuesAtCurrentLevel(self, indices, values, cur_level); index_put__batch_rule(self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim, accumulate); return self; @@ -533,9 +519,7 @@ void _index_put_impl__batch_rule( if (!self_bdim.has_value()) { vmapIncompatibleInplaceError("_index_put_impl_"); } - Tensor self_, values_; - std::vector> indices_; - std::tie(self_, indices_, values_) = index_put_batch_rule_helper( + auto [self_, indices_, values_] = index_put_batch_rule_helper( self, self_bdim, indices, indices_bdims, values, values_bdim); at::_index_put_impl_(self_, List>(indices_), values_, accumulate, unsafe); } @@ -550,11 +534,7 @@ Tensor &_index_put_impl__plumbing(Tensor &self, const List> &in if (!isBatchedAtLevel(self, cur_level) && !isBatchedAtLevel(indices, cur_level) && !isBatchedAtLevel(values, cur_level)) { return at::_index_put_impl_(self, indices, values, accumulate, unsafe); } - Tensor self_value, values_value; - optional self_bdim, values_bdim; - std::vector> indices_value; - std::vector> indices_bdims; - std::tie(self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim) = + auto [self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim] = unpackSelfAndIndicesAndValuesAtCurrentLevel(self, indices, values, cur_level); _index_put_impl__batch_rule(self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim, accumulate, unsafe); return self; @@ -639,9 +619,7 @@ std::tuple> index_put_batch_rule( } } - Tensor self_, values_; - std::vector> indices_; - std::tie(self_, indices_, values_) = index_put_batch_rule_helper( + auto [self_, indices_, values_] = index_put_batch_rule_helper( self, self_bdim, indices, indices_bdims, values, values_bdim, batch_size); // Why do we need to permute values? 
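Most of the churn in these functorch files is the same mechanical cleanup: declaring a value/optional-batch-dim pair up front and filling it with `std::tie` is replaced by a C++17 structured binding over the tuple returned by `unwrapTensorAtLevel`. A self-contained before/after sketch; `Value` and `unwrap` are stand-ins, not the real ATen types:

#include <iostream>
#include <optional>
#include <string>
#include <tuple>

struct Value { std::string name; };

// Stand-in for unwrapTensorAtLevel: returns the payload plus an optional batch dim.
std::tuple<Value, std::optional<int64_t>> unwrap(const Value& v) {
  return std::make_tuple(v, std::optional<int64_t>(0));
}

int main() {
  Value input{"input"};

  // Old style: default-construct, then std::tie.
  Value input_value;
  std::optional<int64_t> input_bdim;
  std::tie(input_value, input_bdim) = unwrap(input);

  // New style: one declaration, no default construction needed.
  auto [value, bdim] = unwrap(input);

  std::cout << value.name << " bdim=" << bdim.value_or(-1) << "\n";
  return 0;
}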
@@ -670,11 +648,7 @@ Tensor index_put_plumbing(const Tensor & self, const List> & in if (!isBatchedAtLevel(self, cur_level) && !isBatchedAtLevel(indices, cur_level) && !isBatchedAtLevel(values, cur_level)) { return self.index_put(indices, values, accumulate); } - Tensor self_value, values_value; - optional self_bdim, values_bdim; - std::vector> indices_value; - std::vector> indices_bdims; - std::tie(self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim) = + auto [self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim] = unpackSelfAndIndicesAndValuesAtCurrentLevel(self, indices, values, cur_level); auto results = index_put_batch_rule(self_value, self_bdim, indices_value, indices_bdims, values_value, values_bdim, accumulate); return makeBatched(std::get<0>(results), std::get<1>(results), cur_level); @@ -835,7 +809,7 @@ Tensor get_expanded_index(const Tensor& index, IntArrayRef self_size, int64_t di if (index.dim() == 0) { return index.expand(self_size); } - dim = maybe_wrap_dim(dim, self_size.size()); + dim = maybe_wrap_dim(dim, static_cast(self_size.size())); // setup new_index_shape as [BS, 1, ..., idx_size, ..., 1] // to reshape index_ @@ -1270,4 +1244,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { m.impl("as_strided_scatter", torch::CppFunction::makeFromBoxedFunction<&vmapErrorFallback>()); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp index fbf058addfbf6..f44000674db8a 100644 --- a/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp +++ b/aten/src/ATen/functorch/BatchRulesUnaryOps.cpp @@ -7,7 +7,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { namespace{ std::tuple> @@ -52,7 +52,7 @@ std::tuple> view_as_complex_batch_rule(const Tensor& self, optional self_bdim) { // guard against the user passing in a batch of scalar tensors with batch // size equal to 2. - TORCH_CHECK(self.sizes().size() > 1, "Input tensor must have one or more dimensions"); + TORCH_CHECK(self.sym_sizes().size() > 1, "Input tensor must have one or more dimensions"); auto self_ = moveBatchDimToFront(self, self_bdim); auto result = at::view_as_complex(self_); @@ -185,4 +185,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { } #undef INVOKE -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchRulesViews.cpp b/aten/src/ATen/functorch/BatchRulesViews.cpp index 345163cd5efdf..81e9d5b9aa21c 100644 --- a/aten/src/ATen/functorch/BatchRulesViews.cpp +++ b/aten/src/ATen/functorch/BatchRulesViews.cpp @@ -5,7 +5,6 @@ // LICENSE file in the root directory of this source tree. #include -#include #include #include @@ -17,7 +16,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { // Note [Adding vmap support for an operator] // Hey there! So you have an operator and you want to get it to work with vmap. @@ -163,9 +162,7 @@ const Tensor& resize__plumbing( return self.resize_(size, optional_memory_format); } - Tensor self_value; - optional self_bdim; - std::tie(self_value, self_bdim) = unwrapTensorAtLevel(self, cur_level); + auto [self_value, self_bdim] = unwrapTensorAtLevel(self, cur_level); TORCH_INTERNAL_ASSERT(self_bdim.has_value()); // TODO: The following algorithm only works for batch dim == 0. 
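Several hunks above add an explicit cast when feeding a `size()` (a `size_t`) into `maybe_wrap_dim`, which does signed arithmetic on the rank. A tiny sketch of why the cast is there, assuming only the usual negative-index wrapping convention (the helper below is illustrative, not ATen's implementation):

#include <cassert>
#include <cstdint>
#include <stdexcept>
#include <vector>

// Maps dim in [-rank, rank) to [0, rank), as dimension wrapping conventionally does.
int64_t wrap_dim(int64_t dim, int64_t rank) {
  if (dim < -rank || dim >= rank) throw std::out_of_range("dim out of range");
  return dim < 0 ? dim + rank : dim;
}

int main() {
  std::vector<int64_t> sizes{2, 3, 4};
  // sizes.size() is unsigned; casting once keeps the arithmetic in wrap_dim
  // signed and silences sign-conversion warnings at the call site.
  int64_t rank = static_cast<int64_t>(sizes.size());
  assert(wrap_dim(-1, rank) == 2);
  assert(wrap_dim(0, rank) == 0);
  return 0;
}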
@@ -204,7 +201,7 @@ std::tuple> squeeze_batch_rule(const Tensor& self, opt int64_t new_batch_idx = 0; int64_t original_idx = 0; - for (auto it : shape) { + for (const auto& it : shape) { // Keep only dimensions != 1 and the batch dimension (irrespective of size). if (it != 1 || original_idx == bdim) { squeezed_sizes.push_back(it); @@ -294,7 +291,7 @@ std::tuple> roll_batch_rule(const Tensor& self, option return std::make_tuple(at::roll_symint(self_, shifts, new_dims), 0); } // We will do something like: t.reshape(a, -1).roll(1, dims=[1, ]).reshape(old_shape) - auto old_shape = self_.sizes(); + auto old_shape = self_.sym_sizes(); new_dims.push_back(1); auto logical_rank = rankWithoutBatchDim(self, bdim); if (logical_rank == 0) { @@ -304,7 +301,7 @@ std::tuple> roll_batch_rule(const Tensor& self, option auto output = at::roll_symint(self_.flatten(1), shifts, new_dims); // NOTE: For scalar tensor, we don't need to unsqueeze as reshape // with `old_shape` takes care of it. - output = output.reshape(old_shape); + output = output.reshape_symint(old_shape); return std::make_tuple(output, 0); } @@ -454,7 +451,7 @@ std::tuple> expand_batch_rule( auto self_ = moveBatchDimToFront(self, self_bdim); auto self_sizes = self_.sym_sizes(); - auto batch_size = self_sizes[0]; + const auto& batch_size = self_sizes[0]; c10::SmallVector size_(size.size() + 1); size_[0] = batch_size; @@ -589,4 +586,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { VMAP_SUPPORT2(unsafe_split, Tensor, unsafe_split_batch_rule); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchedFallback.cpp b/aten/src/ATen/functorch/BatchedFallback.cpp index 8014933056e7e..ef24406846c6c 100644 --- a/aten/src/ATen/functorch/BatchedFallback.cpp +++ b/aten/src/ATen/functorch/BatchedFallback.cpp @@ -17,8 +17,7 @@ #include #include -namespace at { -namespace functorch { +namespace at::functorch { bool kVmapFallbackWarningEnabled = true; @@ -160,7 +159,7 @@ static void batchedTensorInplaceForLoopFallback(const c10::OperatorHandle& op, t "please file a bug report instead."); } batched_tensor_inputs.push_back(tensor); - batched_tensor_inputs_position.push_back(idx); + batched_tensor_inputs_position.push_back(static_cast(idx)); } TORCH_INTERNAL_ASSERT(!batched_tensor_inputs.empty()); @@ -305,7 +304,7 @@ void batchedTensorForLoopFallback(const c10::OperatorHandle& op, torch::jit::Sta continue; } batched_tensor_inputs.push_back(tensor); - batched_tensor_inputs_position.push_back(idx); + batched_tensor_inputs_position.push_back(static_cast(idx)); } TORCH_INTERNAL_ASSERT(!batched_tensor_inputs.empty()); @@ -446,18 +445,18 @@ void batchedNestedTensorForLoopFallback(const c10::OperatorHandle& op, torch::ji continue; } batched_tensor_inputs.push_back(tensor); - batched_tensor_inputs_position.push_back(idx); + batched_tensor_inputs_position.push_back(static_cast(idx)); } TORCH_INTERNAL_ASSERT(!batched_tensor_inputs.empty()); std::vector> unbound; - for (auto iter = batched_tensor_inputs.begin(); iter != batched_tensor_inputs.end(); ++iter) { - auto *batched_impl = maybeGetBatchedImpl(*iter); + for (auto const &batched_tensor_input: batched_tensor_inputs) { + auto *batched_impl = maybeGetBatchedImpl(batched_tensor_input); TORCH_INTERNAL_ASSERT(batched_impl->value().is_nested() || batched_impl->bdim() == 0, "Fallback not supported for mixed nested / non-nested arguments without bdim=0"); c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::BatchedNestedTensor); auto this_unbound = batched_impl->value().unbind(); - if (unbound.size() > 
0) { + if (!unbound.empty()) { TORCH_INTERNAL_ASSERT(unbound.front().size() == this_unbound.size(), "Fallback not supported for differently-sized nested arguments"); } @@ -514,5 +513,4 @@ void vmapErrorFallback(const c10::OperatorHandle& op, torch::jit::Stack* stack) TORCH_CHECK(false, "Error: ", op.operator_name(), " requires special handling, and does not yet have a batching rule. Feel free to file a github issue!"); } -} -} // namespace at +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchedTensorImpl.cpp b/aten/src/ATen/functorch/BatchedTensorImpl.cpp index 1af054a2eba46..7eae8303d2af6 100644 --- a/aten/src/ATen/functorch/BatchedTensorImpl.cpp +++ b/aten/src/ATen/functorch/BatchedTensorImpl.cpp @@ -10,8 +10,7 @@ #include -namespace at { -namespace functorch { +namespace at::functorch { BatchedTensorImpl::BatchedTensorImpl(DispatchKeySet key_set, Tensor value, int64_t bdim, int64_t level) : TensorImpl( @@ -71,7 +70,7 @@ void BatchedTensorImpl::refreshTensorMetadata() { int64_t BatchedTensorImpl::actualDim(int64_t dim, bool wrap_dim) const { if (wrap_dim) { const auto ndim = sizes_and_strides_.size(); - dim = maybe_wrap_dim(dim, ndim); + dim = maybe_wrap_dim(dim, static_cast(ndim)); } if (bdim_ <= dim) { return dim + 1; @@ -161,6 +160,7 @@ c10::intrusive_ptr BatchedTensorImpl::shallow_copy_and_detach( } c10::intrusive_ptr BatchedTensorImpl::shallow_copy_and_detach( + // NOLINTNEXTLINE(cppcoreguidelines-rvalue-reference-param-not-moved) c10::VariableVersion&& version_counter, bool allow_tensor_metadata_change) const { TORCH_CHECK(false, "accessing `data` under vmap transform is not allowed"); @@ -185,5 +185,4 @@ Tensor addBatchDim(const Tensor& tensor, int64_t dim, int64_t level) { return makeBatched(tensor, dim, level); } -} -} // namespace at +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/BatchedTensorImpl.h b/aten/src/ATen/functorch/BatchedTensorImpl.h index d29f3f6d6a370..f3754e3c30816 100644 --- a/aten/src/ATen/functorch/BatchedTensorImpl.h +++ b/aten/src/ATen/functorch/BatchedTensorImpl.h @@ -7,7 +7,6 @@ #pragma once #include -#include #include #include @@ -119,15 +118,15 @@ inline bool isBatchedTensor(const Tensor& tensor) { // It is unsafe to call this on a Tensor that is not backed by a // BatchedTensorImpl. Please use `maybeGetBatchedImpl` whenever possible. -inline BatchedTensorImpl* unsafeGetBatchedImpl(Tensor tensor) { +inline BatchedTensorImpl* unsafeGetBatchedImpl(const Tensor& tensor) { return static_cast(tensor.unsafeGetTensorImpl()); } -inline BatchedTensorImpl* maybeGetBatchedImpl(Tensor tensor) { +inline BatchedTensorImpl* maybeGetBatchedImpl(const Tensor& tensor) { if (!isBatchedTensor(tensor)) { return nullptr; } - return unsafeGetBatchedImpl(std::move(tensor)); + return unsafeGetBatchedImpl(tensor); } // Returns a bitset. If bit i is set, then that means dim i is a batchdim. 
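The BatchedTensorImpl.h hunk above changes `maybeGetBatchedImpl` and `unsafeGetBatchedImpl` to take `const Tensor&` instead of `Tensor` by value, so callers no longer pay a refcount bump per query. A sketch of the effect with a generic shared handle standing in for `at::Tensor`:

#include <cassert>
#include <memory>

// Stand-in for a Tensor handle: copying it bumps a shared refcount, much like
// copying an at::Tensor bumps its TensorImpl refcount.
struct Handle {
  std::shared_ptr<int> impl = std::make_shared<int>(0);
};

long use_count_by_value(Handle h) { return h.impl.use_count(); }        // copy: refcount churn
long use_count_by_cref(const Handle& h) { return h.impl.use_count(); }  // no copy

int main() {
  Handle t;
  assert(use_count_by_value(t) == 2);  // the by-value parameter holds its own reference
  assert(use_count_by_cref(t) == 1);   // no extra reference taken
  return 0;
}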
diff --git a/aten/src/ATen/functorch/DynamicLayer.cpp b/aten/src/ATen/functorch/DynamicLayer.cpp index 2d271a613340a..45976fa855f32 100644 --- a/aten/src/ATen/functorch/DynamicLayer.cpp +++ b/aten/src/ATen/functorch/DynamicLayer.cpp @@ -17,8 +17,7 @@ #include #include -namespace at { -namespace functorch { +namespace at::functorch { void setDynamicLayerFrontBackKeysIncluded(bool included) { c10::impl::tls_set_dispatch_key_included(DispatchKey::FuncTorchDynamicLayerFrontMode, included); @@ -235,7 +234,7 @@ int64_t pushDynamicLayer(DynamicLayer&& dynamic_layer) { auto& dynamicLayerStack = dynamicLayerStackAccessor(); int64_t layerId = 1 + dynamicLayerStack.size(); TORCH_INTERNAL_ASSERT(layerId == dynamic_layer.layerId()); - dynamicLayerStack.emplace_back(dynamic_layer); + dynamicLayerStack.emplace_back(std::move(dynamic_layer)); if (layerId == 1) { setDynamicLayerFrontBackKeysIncluded(true); @@ -258,7 +257,7 @@ int64_t initAndPushDynamicLayer( optional functionalize_add_back_views) { const auto& dynamicLayerStack = dynamicLayerStackAccessor(); const auto layerId = 1 + dynamicLayerStack.size(); - DynamicLayer new_layer(transform_type, layerId, batch_size, randomness, prev_grad_mode, prev_fwd_grad_mode, functionalize_add_back_views); + DynamicLayer new_layer(transform_type, layerId, std::move(batch_size), randomness, prev_grad_mode, prev_fwd_grad_mode, functionalize_add_back_views); // NB: this function should be called while holding the GIL to avoid races new_layer.interpreter().set_is_alive(true); pushDynamicLayer(std::move(new_layer)); @@ -307,7 +306,7 @@ void foreachTensorInplace(std::vector& args, int64_t begin, int64_t end, } void foreachTensorInplaceWithFlag(std::vector& args, int64_t begin, int64_t end, - const std::bitset<64> use_flag_relative, std::function func){ + const std::bitset<64> use_flag_relative, const std::function& func){ TORCH_INTERNAL_ASSERT(begin >= 0); TORCH_INTERNAL_ASSERT(end >= 0); TORCH_INTERNAL_ASSERT(begin <= end); @@ -511,5 +510,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchDynamicLayerBackMode, m) { SPECIAL_GRAD_CASE(alias); } -} -} // namespace at +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/FunctionalizeInterpreter.cpp b/aten/src/ATen/functorch/FunctionalizeInterpreter.cpp index 717eb87ae1f31..89175cc79c5ec 100644 --- a/aten/src/ATen/functorch/FunctionalizeInterpreter.cpp +++ b/aten/src/ATen/functorch/FunctionalizeInterpreter.cpp @@ -2,7 +2,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { static void sanityCheckNotFunctional(const c10::OperatorHandle& op, torch::jit::Stack* stack, size_t num_args) { foreachTensorInplace(*stack, stack->size() - num_args, stack->size(), @@ -64,4 +64,4 @@ void FunctionalizeInterpreterPtr::sendToNextInterpreterImpl( sanityCheckNotFunctional(op, stack, ret_size); } -}} // namespace at::functorch +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/Interpreter.cpp b/aten/src/ATen/functorch/Interpreter.cpp index 8d672c4128280..609cda8562953 100644 --- a/aten/src/ATen/functorch/Interpreter.cpp +++ b/aten/src/ATen/functorch/Interpreter.cpp @@ -6,9 +6,7 @@ #include #include -#include - -namespace at { namespace functorch { +namespace at::functorch { static DispatchKeySet get_all_dynlayer_keyset() { // NB: FULL_AFTER does not include the dispatch key @@ -92,12 +90,12 @@ std::ostream& operator<<(std::ostream& os, const TransformType& t) { void sanityCheckStack(const c10::OperatorHandle& op, torch::jit::Stack* stack) { auto num_args = op.schema().arguments().size(); - 
foreachTensorInplace(*stack, stack->size() - num_args, stack->size(), + foreachTensorInplace(*stack, static_cast(stack->size() - num_args), static_cast(stack->size()), [](const Tensor& tensor) { auto result = unwrapIfDead(tensor); auto* wrapper = maybeGetTensorWrapper(result); TORCH_INTERNAL_ASSERT(wrapper == nullptr); - auto* batched = maybeGetBatchedImpl(std::move(result)); + auto* batched = maybeGetBatchedImpl(result); TORCH_INTERNAL_ASSERT(batched == nullptr); return tensor; }); @@ -129,4 +127,4 @@ void Interpreter::sendToNextInterpreter(const c10::OperatorHandle& op, torch::ji INTERPRETER_DISPATCH(key_, SINGLE_ARG(sendToNextInterpreterImpl(op, stack, grad_special_case))); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/Interpreter.h b/aten/src/ATen/functorch/Interpreter.h index 81190ffde1686..c08882fc30137 100644 --- a/aten/src/ATen/functorch/Interpreter.h +++ b/aten/src/ATen/functorch/Interpreter.h @@ -5,6 +5,7 @@ #include #include #include +#include #include namespace at::functorch { @@ -144,7 +145,7 @@ struct Interpreter { void saveLocalDispatchKeySet(c10::impl::LocalDispatchKeySet keyset) { TORCH_INTERNAL_ASSERT(!savedLocalDispatchKeySet_.has_value()); - savedLocalDispatchKeySet_ = std::move(keyset); + savedLocalDispatchKeySet_ = keyset; } void clearSavedLocalDispatchKeySet() { TORCH_INTERNAL_ASSERT(savedLocalDispatchKeySet_.has_value()); @@ -173,11 +174,11 @@ struct Interpreter { private: explicit Interpreter(TransformType type, int64_t level, InterpreterMeta meta): - type_(type), level_(level), is_alive_(std::make_shared(false)), meta_(meta) {} + type_(type), level_(level), is_alive_(std::make_shared(false)), meta_(std::move(meta)) {} // fields - TransformType type_; - int64_t level_; + TransformType type_{}; + int64_t level_{}; optional savedLocalDispatchKeySet_; std::shared_ptr is_alive_; InterpreterMeta meta_; @@ -195,7 +196,7 @@ void foreachTensorInplace(std::vector& args, int64_t begin, int64_t end, // args[i] = func(args[i], i - begin, true) // args[i] = func(args[i], i - begin) void foreachTensorInplaceWithFlag(std::vector& args, int64_t begin, int64_t end, - const std::bitset<64> use_flag_relative, std::function func); + const std::bitset<64> use_flag_relative, const std::function& func); std::vector findUnwrappedInputs(std::vector& args, int64_t begin, int64_t end); diff --git a/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp b/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp index 5dd569bea1e36..b7a131766ec86 100644 --- a/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp +++ b/aten/src/ATen/functorch/LegacyBatchingRegistrations.cpp @@ -19,8 +19,7 @@ #include -namespace at { -namespace functorch { +namespace at::functorch { // NOTE: [What is a batching rule?] 
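A recurring one-line change across these files is collapsing the doubly nested `namespace at { namespace functorch {` opener (and its `}}` closer) into the C++17 nested-namespace form with a commented closing brace. A compilable sketch of the two spellings:

// Pre-C++17 spelling:
//   namespace at { namespace functorch {
//   ...
//   }}   // two bare closing braces, easy to mismatch
//
// C++17 nested namespace definition, as used throughout this patch:
namespace at::functorch {

inline int answer() { return 42; }

} // namespace at::functorch

int main() { return at::functorch::answer() == 42 ? 0 : 1; }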
@@ -259,6 +258,18 @@ std::vector split_with_sizes_batching_rule(const Tensor& self, SymIntArr return result; } +std::vector split_with_sizes_copy_batching_rule(const Tensor& self, SymIntArrayRef split_sizes, int64_t dim) { + if (!participatesInCurrentLevel(self)) { + c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); + return split_with_sizes_copy_symint(self, split_sizes, dim); + } + auto self_physical = MultiBatchVmapTransform::logicalToPhysical(self); + auto dim_physical = self_physical.getPhysicalDim(dim); + auto result = split_with_sizes_copy_symint(self_physical.tensor(), split_sizes, dim_physical); + self_physical.getPhysicalToLogicalMap().applyInplace(result); + return result; +} + std::vector unbind_batching_rule(const Tensor& self, int64_t dim) { if (!participatesInCurrentLevel(self)) { c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::FuncTorchBatched); @@ -275,7 +286,7 @@ std::vector unbind_batching_rule(const Tensor& self, int64_t dim) { // can be indexed (or nullopt if such a location doesn't exist, e.g., tensors // with zero-size dims). static optional maximum_indexable_location( - c10::SymIntArrayRef sizes, c10::SymIntArrayRef strides, c10::SymInt storage_offset) { + c10::SymIntArrayRef sizes, c10::SymIntArrayRef strides, const c10::SymInt& storage_offset) { auto result = native::storage_size_for(sizes, strides); if (result == 0) { return nullopt; @@ -292,7 +303,7 @@ static void checkBasicAsStridedValidForSlice( int64_t num_batch_dims, c10::SymIntArrayRef sizes, c10::SymIntArrayRef strides, - optional maybe_storage_offset) { + const optional& maybe_storage_offset) { auto slice_sizes = physical_tensor.sym_sizes().slice(num_batch_dims); auto slice_strides = physical_tensor.sym_strides().slice(num_batch_dims); auto base_offset = physical_tensor.sym_storage_offset(); @@ -682,17 +693,17 @@ Tensor new_empty_strided_batching_rule( } Tensor nested_cat_batching_rule(const ITensorListRef& tensors, int64_t dim) { - TORCH_CHECK(tensors.size() > 0, "cat() not supported on empty tensor list"); + TORCH_CHECK(!tensors.empty(), "cat() not supported on empty tensor list"); std::vector> unbound; - for (auto tensor_iter = tensors.begin(); tensor_iter != tensors.end(); ++tensor_iter) { - auto* maybe_batched_impl = maybeGetBatchedImpl(*tensor_iter); + for (const auto & tensor : tensors) { + auto* maybe_batched_impl = maybeGetBatchedImpl(tensor); TORCH_CHECK(maybe_batched_impl, "Tried to run batching rule for cat() on a non-batched tensor"); auto nt = maybe_batched_impl->value(); TORCH_CHECK(nt.is_nested(), "Tried to run batching rule for cat() on a non-nested tensor"); c10::impl::ExcludeDispatchKeyGuard guard(DispatchKey::BatchedNestedTensor); auto this_unbound = nt.unbind(); - if (unbound.size() > 0) { + if (!unbound.empty()) { TORCH_INTERNAL_ASSERT(unbound.front().size() == this_unbound.size(), "cat() not supported for differently-sized nested arguments"); } @@ -725,6 +736,7 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchBatched, m) { // still legacy b/c teturns multiple tensors m.impl("split.Tensor", split_batching_rule); m.impl("split_with_sizes", split_with_sizes_batching_rule); + m.impl("split_with_sizes_copy", split_with_sizes_copy_batching_rule); m.impl("unbind.int", unbind_batching_rule); m.impl("cat", cat_batching_rule); m.impl("block_diag", block_diag_batching_rule); @@ -751,5 +763,5 @@ TORCH_LIBRARY_IMPL(_, BatchedNestedTensor, m) { TORCH_LIBRARY_IMPL(aten, BatchedNestedTensor, m) { m.impl("cat", nested_cat_batching_rule); } -} // namespace functorch -} // namespace 
at + +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/LegacyVmapTransforms.cpp b/aten/src/ATen/functorch/LegacyVmapTransforms.cpp index 682169a52622d..e7242cc8f07fc 100644 --- a/aten/src/ATen/functorch/LegacyVmapTransforms.cpp +++ b/aten/src/ATen/functorch/LegacyVmapTransforms.cpp @@ -10,8 +10,7 @@ #include #include -namespace at { -namespace functorch { +namespace at::functorch { // Takes a BatchedTensorImpl, permutes all of the batch dims to the front, // and then returns a physical version of the Tensor. @@ -20,7 +19,7 @@ static Tensor permuteBatchDimsToFront(const BatchedTensorImpl* batched) { if (batched->bdim() == 0) { return physical_tensor; } - const auto sizes = physical_tensor.sizes(); + const auto sizes = physical_tensor.sym_sizes(); VmapDimVector permutation(sizes.size(), 0); permutation.reserve(sizes.size()); const auto is_bdim = createBatchDimBitset(batched->bdim()); @@ -97,14 +96,14 @@ static std::tuple computeFrontBatchDimsFromLevels(std::bitset< return std::make_tuple(dim, level); } -static Tensor moveDimToFrontAndExpand(Tensor tensor, optional dim, int64_t size) { +static Tensor moveDimToFrontAndExpand(Tensor tensor, optional dim, c10::SymInt size) { if (dim) { tensor = tensor.movedim(*dim, 0); } else { tensor = tensor.unsqueeze(0); - auto expanded_sizes = tensor.sizes().vec(); + auto expanded_sizes = tensor.sym_sizes().vec(); expanded_sizes[0] = size; - tensor = tensor.expand(expanded_sizes); + tensor = tensor.expand_symint(expanded_sizes); } return tensor; } @@ -120,7 +119,7 @@ static Tensor moveDimToFrontAndExpand(Tensor tensor, optional dim, int6 VmapPhysicalViewVec MultiBatchVmapTransform::logicalToPhysical(ITensorListRef logical_tensors) { auto cur_level = maybeCurrentDynamicLayer().value().layerId(); - auto bdim_size = -1; + c10::SymInt bdim_size = -1; // Figure out the batch size first for (const auto& logical_tensor : logical_tensors) { @@ -131,12 +130,12 @@ MultiBatchVmapTransform::logicalToPhysical(ITensorListRef logical_tensors) { if (batched->level() != cur_level) { continue; } - bdim_size = batched->value().size(batched->bdim()); + bdim_size = batched->value().sym_size(batched->bdim()); } TORCH_INTERNAL_ASSERT(bdim_size != -1); std::bitset levels; - levels[cur_level] = 1; + levels[cur_level] = true; VmapPhysicalViewVec result; for (const auto& logical_tensor : logical_tensors) { @@ -185,7 +184,7 @@ VmapPhysicalViewVec BroadcastingVmapTransform::logicalToPhysical(TensorList logi TORCH_INTERNAL_ASSERT(bdim_size != -1); std::bitset levels; - levels[cur_level] = 1; + levels[cur_level] = true; // figure out the example ndim int64_t max_example_dim = -1; @@ -227,5 +226,4 @@ void VmapPhysicalToLogicalMap::applyInplace(std::vector& physical_tensor } } -} -} // namespace at +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/LegacyVmapTransforms.h b/aten/src/ATen/functorch/LegacyVmapTransforms.h index 7944d99bae45d..390989d45bf73 100644 --- a/aten/src/ATen/functorch/LegacyVmapTransforms.h +++ b/aten/src/ATen/functorch/LegacyVmapTransforms.h @@ -120,7 +120,7 @@ struct VmapPhysicalToLogicalMap; // levels: 012345 struct TORCH_API VmapPhysicalView { VmapPhysicalView(Tensor&& tensor, std::bitset levels) - : levels_(levels), tensor_(tensor) { + : levels_(levels), tensor_(std::move(tensor)) { // TORCH_INTERNAL_ASSERT(!isBatchedTensor(tensor)); } diff --git a/aten/src/ATen/functorch/PlumbingHelper.cpp b/aten/src/ATen/functorch/PlumbingHelper.cpp index 43ab457d4ad62..76982fd1b6480 100644 --- a/aten/src/ATen/functorch/PlumbingHelper.cpp +++ 
b/aten/src/ATen/functorch/PlumbingHelper.cpp @@ -9,7 +9,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { void vmap_check_escaped(const optional &layer, const char* what) { TORCH_CHECK( @@ -92,4 +92,4 @@ bool areAnyBatchedAtLevel(ArrayRef> maybe_tensors, int64_t leve } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp index 448145b14e480..355ac5965da51 100644 --- a/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp +++ b/aten/src/ATen/functorch/PyTorchOperatorHacks.cpp @@ -11,7 +11,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { // NOTE: [functorch's PyTorch Operator Hacks] // @@ -167,7 +167,7 @@ namespace dropout_hack { namespace { template -using Ctype = typename std::conditional::type; +using Ctype = std::conditional_t; static Tensor make_feature_noise(const Tensor& input) { auto input_sizes = input.sizes(); @@ -312,4 +312,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchDynamicLayerFrontMode, m) { m.impl("feature_alpha_dropout_", dropout_hack::feature_alpha_dropout_); } -}} +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/TensorWrapper.cpp b/aten/src/ATen/functorch/TensorWrapper.cpp index a8411570801db..4be5725e800f3 100644 --- a/aten/src/ATen/functorch/TensorWrapper.cpp +++ b/aten/src/ATen/functorch/TensorWrapper.cpp @@ -13,8 +13,7 @@ #include -namespace at { -namespace functorch { +namespace at::functorch { void dumpTensor(std::ostream& ss, const Tensor& tensor) { auto* wrapped = maybeGetTensorWrapper(tensor); @@ -51,7 +50,7 @@ void TensorWrapper::refreshMetadata() { void dumpTensorCout(const Tensor& tensor) { dumpTensor(std::cout, tensor); - std::cout << std::endl; + std::cout << '\n'; } static c10::intrusive_ptr makeTensorWrapperPtr(const Tensor& tensor, int64_t level, const std::shared_ptr& life_handle) { @@ -82,6 +81,11 @@ static Tensor unsafeMakeTensorWrapper( auto result = at::detail::make_tensor( key_set, tensor, level, life_handle, is_immutable); TORCH_INTERNAL_ASSERT(result.key_set().has(DispatchKey::FuncTorchGradWrapper)); + + if (tensor.unsafeGetTensorImpl()->is_wrapped_number()) { + result.unsafeGetTensorImpl()->set_wrapped_number(true); + } + return result; } @@ -204,5 +208,4 @@ TORCH_LIBRARY_IMPL(_, FuncTorchGradWrapper, m) { m.fallback(torch::CppFunction::makeFromBoxedFunction<&dead_tensor_wrapper_fallback>()); } -} -} // namespace at +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/VmapInterpreter.cpp b/aten/src/ATen/functorch/VmapInterpreter.cpp index 21f40adb0d140..33774e0433264 100644 --- a/aten/src/ATen/functorch/VmapInterpreter.cpp +++ b/aten/src/ATen/functorch/VmapInterpreter.cpp @@ -1,7 +1,7 @@ #include #include -namespace at { namespace functorch { +namespace at::functorch { void VmapInterpreterPtr::processImpl( const c10::OperatorHandle& op, @@ -21,4 +21,4 @@ void VmapInterpreterPtr::sendToNextInterpreterImpl( op.callBoxed(stack); } -}} // namespace at::functorch +} // namespace at::functorch diff --git a/aten/src/ATen/functorch/VmapModeRegistrations.cpp b/aten/src/ATen/functorch/VmapModeRegistrations.cpp index ad413d48c7ebc..195afd80bc713 100644 --- a/aten/src/ATen/functorch/VmapModeRegistrations.cpp +++ b/aten/src/ATen/functorch/VmapModeRegistrations.cpp @@ -17,8 +17,7 @@ // FuncTorchVmapMode -- these registrations are to error out on operations // that we don't support on regular Tensors. 
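The PyTorchOperatorHacks hunk above swaps the verbose `typename std::conditional<...>::type` spelling for the C++14 alias `std::conditional_t`. A standalone sketch of the equivalence; `int32_t` is a placeholder for whatever type the real `Ctype` alias selects:

#include <cstdint>
#include <type_traits>

// The C++11 form and the C++14 alias template name the same type;
// the alias also drops the leading `typename`.
template <bool inplace>
using CtypeOld = typename std::conditional<inplace, int32_t&, int32_t>::type;

template <bool inplace>
using Ctype = std::conditional_t<inplace, int32_t&, int32_t>;

static_assert(std::is_same_v<CtypeOld<true>,  Ctype<true>>,  "same reference type");
static_assert(std::is_same_v<CtypeOld<false>, Ctype<false>>, "same value type");

int main() { return 0; }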
-namespace at { -namespace functorch { +namespace at::functorch { static void unsupportedRandomOp(const c10::OperatorHandle& op, torch::jit::Stack* stack) { TORCH_CHECK(false, "vmap: We do not support calling out variants of random operations inside of vmap. ", @@ -68,6 +67,4 @@ TORCH_LIBRARY_IMPL(aten, FuncTorchVmapMode, m) { NYI_RANDOM(rrelu); } - -} -} // namespace at +} // namespace at::functorch diff --git a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h index c5a607c51b391..8e2654bafe90b 100644 --- a/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPAllocatorMasqueradingAsCUDA.h @@ -15,7 +15,7 @@ class HIPAllocatorMasqueradingAsCUDA final : public Allocator { public: explicit HIPAllocatorMasqueradingAsCUDA(Allocator* allocator) : allocator_(allocator) {} - DataPtr allocate(size_t size) const override { + DataPtr allocate(size_t size) override { DataPtr r = allocator_->allocate(size); r.unsafe_set_device(Device(c10::DeviceType::CUDA, r.device().index())); return r; @@ -23,6 +23,9 @@ class HIPAllocatorMasqueradingAsCUDA final : public Allocator { DeleterFnPtr raw_deleter() const override { return allocator_->raw_deleter(); } + void copy_data(void* dest, const void* src, std::size_t count) const final { + allocator_->copy_data(dest, src, count); + } }; }} // namespace c10::hip diff --git a/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h index 5ef7765519de9..a0fc211e4c8ae 100644 --- a/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPGuardImplMasqueradingAsCUDA.h @@ -88,6 +88,9 @@ struct HIPGuardImplMasqueradingAsCUDA final : public c10::impl::DeviceGuardImplI Stream getDefaultStream(Device d) const override { return getDefaultHIPStreamMasqueradingAsCUDA(d.index()); } + Stream getNewStream(Device d, int priority = 0) const override { + return getStreamFromPoolMasqueradingAsCUDA(priority, d.index()); + } Stream getStreamFromGlobalPool(Device d, bool isHighPriority = false) const override { return getStreamFromPoolMasqueradingAsCUDA(isHighPriority, d.index()); } @@ -120,11 +123,9 @@ struct HIPGuardImplMasqueradingAsCUDA final : public c10::impl::DeviceGuardImplI auto hip_flag = hipEventDefault; switch (flag) { case EventFlag::PYTORCH_DEFAULT: - case EventFlag::HIP_EVENT_DISABLE_TIMING: hip_flag = hipEventDisableTiming; break; case EventFlag::BACKEND_DEFAULT: - case EventFlag::HIP_EVENT_DEFAULT: hip_flag = hipEventDefault; break; default: diff --git a/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h b/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h index 2b30018b4a888..fb13ada5ad88e 100644 --- a/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h +++ b/aten/src/ATen/hip/impl/HIPStreamMasqueradingAsCUDA.h @@ -96,6 +96,11 @@ inline getStreamFromPoolMasqueradingAsCUDA(const bool isHighPriority = false, De return HIPStreamMasqueradingAsCUDA(getStreamFromPool(isHighPriority, device)); } +HIPStreamMasqueradingAsCUDA +inline getStreamFromPoolMasqueradingAsCUDA(const int priority, DeviceIndex device = -1) { + return HIPStreamMasqueradingAsCUDA(getStreamFromPool(priority, device)); +} + HIPStreamMasqueradingAsCUDA inline getStreamFromExternalMasqueradingAsCUDA(hipStream_t ext_stream, DeviceIndex device) { return HIPStreamMasqueradingAsCUDA(getStreamFromExternal(ext_stream, device)); diff --git a/aten/src/ATen/miopen/AutocastRNN.cpp b/aten/src/ATen/miopen/AutocastRNN.cpp new file mode 
100644 index 0000000000000..271d80ea03cd4 --- /dev/null +++ b/aten/src/ATen/miopen/AutocastRNN.cpp @@ -0,0 +1,64 @@ +#include +#include +#include +#include + +namespace at { +namespace autocast { + +/********************************************************************** +Autocast wrapper for MIOpen RNNs +**********************************************************************/ +std::tuple +miopen_rnn(const Tensor & input_r, + TensorList weight, + int64_t weight_stride0, + const Tensor & hx, + const c10::optional& cx_opt, + int64_t fn_mode, + int64_t fn_hidden_size, + int64_t fn_num_layers, + bool batch_first, + double fn_dropout, + bool fn_train, + bool fn_bidirectional, + IntArrayRef fn_batch_sizes, + const c10::optional& fn_dropout_state_opt) { + +#if AT_ROCM_ENABLED() + + c10::impl::ExcludeDispatchKeyGuard no_autocast(DispatchKey::Autocast); + + return at::miopen_rnn( + cached_cast(at::kHalf, input_r), + cached_cast(at::kHalf, weight), + weight_stride0, + cached_cast(at::kHalf, hx), + cached_cast(at::kHalf, cx_opt), + fn_mode, + fn_hidden_size, + fn_num_layers, + batch_first, + fn_dropout, + fn_train, + fn_bidirectional, + fn_batch_sizes, + fn_dropout_state_opt); + +#else + AT_ERROR("autocast::miopen_rnn: ATen not compiled with ROCm enabled"); + return {Tensor{}, Tensor{}, Tensor{}, Tensor{}, Tensor{}}; // placate the compiler +#endif + +} + +// Register Autocast dispatch +namespace { +TORCH_LIBRARY_IMPL(aten, Autocast, m) { + m.impl("miopen_rnn", + TORCH_FN((&at::autocast::miopen_rnn))); +} +} // anonymous namespace + +} // namespace autocast +} // namespace at diff --git a/aten/src/ATen/mkl/Sparse.h b/aten/src/ATen/mkl/Sparse.h index 2763feef47c5c..9a09b042c9fe0 100644 --- a/aten/src/ATen/mkl/Sparse.h +++ b/aten/src/ATen/mkl/Sparse.h @@ -4,7 +4,7 @@ // MKL Sparse is not currently supported on Windows // See https://github.com/pytorch/pytorch/issues/97352 -#if AT_MKL_ENABLED() && (!defined(_WIN32)) +#if AT_MKL_ENABLED() #define AT_USE_MKL_SPARSE() 1 #else #define AT_USE_MKL_SPARSE() 0 diff --git a/aten/src/ATen/mkl/SparseBlas.cpp b/aten/src/ATen/mkl/SparseBlas.cpp index 90a60b42c1bdc..d84e6abb34f23 100644 --- a/aten/src/ATen/mkl/SparseBlas.cpp +++ b/aten/src/ATen/mkl/SparseBlas.cpp @@ -19,9 +19,7 @@ MKL_Complex to_mkl_complex(c10::complex scalar) { } // namespace -// There are link errors when compiling with create_csr functions on Windows. 
-// See https://github.com/pytorch/pytorch/pull/50937#issuecomment-779272492 -#if !defined(_WIN32) + template <> void create_csr(MKL_SPARSE_CREATE_CSR_ARGTYPES(float)) { TORCH_MKLSPARSE_CHECK(mkl_sparse_s_create_csr( @@ -117,7 +115,6 @@ void create_bsr>( col_indx, reinterpret_cast(values))); } -#endif // !defined(_WIN32) template <> void mv(MKL_SPARSE_MV_ARGTYPES(float)) { @@ -152,7 +149,6 @@ void mv>(MKL_SPARSE_MV_ARGTYPES(c10::complex)) { reinterpret_cast(y))); } -#if !defined(_WIN32) template <> void add(MKL_SPARSE_ADD_ARGTYPES(float)) { TORCH_MKLSPARSE_CHECK(mkl_sparse_s_add(operation, A, alpha, B, C)); @@ -171,7 +167,6 @@ void add>(MKL_SPARSE_ADD_ARGTYPES(c10::complex)) { TORCH_MKLSPARSE_CHECK(mkl_sparse_z_add( operation, A, to_mkl_complex(alpha), B, C)); } -#endif // !defined(_WIN32) template <> void export_csr(MKL_SPARSE_EXPORT_CSR_ARGTYPES(float)) { @@ -251,7 +246,6 @@ void mm>(MKL_SPARSE_MM_ARGTYPES(c10::complex)) { ldc)); } -#if !defined(_WIN32) template <> void spmmd(MKL_SPARSE_SPMMD_ARGTYPES(float)) { TORCH_MKLSPARSE_CHECK(mkl_sparse_s_spmmd( @@ -282,7 +276,6 @@ void spmmd>(MKL_SPARSE_SPMMD_ARGTYPES(c10::complex) reinterpret_cast(C), ldc)); } -#endif template <> void trsv(MKL_SPARSE_TRSV_ARGTYPES(float)) { diff --git a/aten/src/ATen/mps/EmptyTensor.cpp b/aten/src/ATen/mps/EmptyTensor.cpp index 3361cca8201c8..f7918ac18993c 100644 --- a/aten/src/ATen/mps/EmptyTensor.cpp +++ b/aten/src/ATen/mps/EmptyTensor.cpp @@ -43,7 +43,8 @@ TensorBase empty_mps( int64_t nelements = c10::multiply_integers(size); auto dtype = dtype_or_default(dtype_opt); TORCH_CHECK_TYPE(dtype != ScalarType::Double, MPS_ERROR_DOUBLE_NOT_SUPPORTED); - TORCH_CHECK_TYPE(dtype != ScalarType::BFloat16, "BFloat16 is not supported on MPS"); + TORCH_CHECK_TYPE(dtype != ScalarType::BFloat16 || is_macos_13_or_newer(mps::MacOSVersion::MACOS_VER_14_0_PLUS), "MPS BFloat16 is only supported on MacOS 14 or newer"); + auto dtype_meta = scalarTypeToTypeMeta(dtype); int64_t size_bytes = nelements * dtype_meta.itemsize(); diff --git a/aten/src/ATen/mps/MPSAllocator.mm b/aten/src/ATen/mps/MPSAllocator.mm index 5b59cd5d1ddcd..76280fb469e58 100644 --- a/aten/src/ATen/mps/MPSAllocator.mm +++ b/aten/src/ATen/mps/MPSAllocator.mm @@ -748,7 +748,7 @@ DeleterFnPtr raw_deleter() const override { return &Delete; } - DataPtr allocate(const size_t nbytes) const override { + DataPtr allocate(const size_t nbytes) override { __block id buf = nbytes > 0 ? 
_getAllocImpl().malloc(nbytes, m_usage) : nullptr; return {buf, buf, &Delete, at::Device(at::DeviceType::MPS, 0)}; } @@ -819,6 +819,10 @@ bool waitForEvents(c10::ArrayRef buffers) const override { return _getAllocImpl().format_size(size); } + void copy_data(void* dest, const void* src, std::size_t count) const final { + default_copy_data(dest, src, count); + } + private: bool m_has_unified_memory; uint32_t m_usage; diff --git a/aten/src/ATen/mps/MPSDevice.h b/aten/src/ATen/mps/MPSDevice.h index 40ab07077293d..084820ab42e41 100644 --- a/aten/src/ATen/mps/MPSDevice.h +++ b/aten/src/ATen/mps/MPSDevice.h @@ -22,8 +22,6 @@ typedef void* MTLComputePipelineState_t; typedef void* MTLLibrary_t; #endif -using namespace std; - namespace at::mps { // Helper enum to check if a MPSGraph op is supported in a given macOS version diff --git a/aten/src/ATen/mps/MPSDevice.mm b/aten/src/ATen/mps/MPSDevice.mm index a0e3b70e98769..c6e8fd732e70a 100644 --- a/aten/src/ATen/mps/MPSDevice.mm +++ b/aten/src/ATen/mps/MPSDevice.mm @@ -14,8 +14,8 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id& device, bool macOS13Plus) { // MPS Advanced Indexing needs at least Metal 2.0 (support for Argument Buffers and function constants) - // host_name attribute needs at least Metal 2.2 - MTLLanguageVersion languageVersion = MTLLanguageVersion2_2; + // host_name attribute needs at least Metal 2.2 and ulong needs Metal 2.3 (supported on MacOS 11+ + MTLLanguageVersion languageVersion = MTLLanguageVersion2_3; #if defined(__MAC_13_0) if (macOS13Plus) { languageVersion = MTLLanguageVersion3_0; diff --git a/aten/src/ATen/mps/MPSHooks.h b/aten/src/ATen/mps/MPSHooks.h index 546b47bf55aa6..667430eaf8114 100644 --- a/aten/src/ATen/mps/MPSHooks.h +++ b/aten/src/ATen/mps/MPSHooks.h @@ -46,6 +46,12 @@ struct MPSHooks : public at::MPSHooksInterface { void synchronizeEvent(uint32_t event_id) const override; bool queryEvent(uint32_t event_id) const override; double elapsedTimeOfEvents(uint32_t start_event_id, uint32_t end_event_id) const override; + + // Compatibility with Accelerator API + bool hasPrimaryContext(DeviceIndex device_index) const override { + // When MPS is available, it is always in use for the one device. + return true; + } }; } // namespace at::mps diff --git a/aten/src/ATen/mps/MPSProfiler.h b/aten/src/ATen/mps/MPSProfiler.h index 994c50ad9e61c..7ee9db5dd3242 100644 --- a/aten/src/ATen/mps/MPSProfiler.h +++ b/aten/src/ATen/mps/MPSProfiler.h @@ -9,12 +9,12 @@ #include #include +#include +#include #include #include -#include #include #include -#include namespace at::mps { @@ -296,9 +296,15 @@ class MPSProfiler { // during runtime (instead of environment variables). // The "mode" could be either "interval", "event", or both "interval,event" // for interval-based and/or event-based signpost tracing. 
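The allocator hunks above (HIPAllocatorMasqueradingAsCUDA and the MPS allocator) each gain a `copy_data` override: the wrapper delegates to the allocator it decorates, while the concrete allocator falls back to a default byte copy. A simplified sketch of that wrapper pattern, with invented interface names rather than the real c10 classes:

#include <cassert>
#include <cstddef>
#include <cstring>

// Simplified allocator interface: copy_data is the virtual hook being added.
struct Allocator {
  virtual ~Allocator() = default;
  virtual void copy_data(void* dest, const void* src, std::size_t count) const = 0;
};

struct HostAllocator final : Allocator {
  void copy_data(void* dest, const void* src, std::size_t count) const override {
    std::memcpy(dest, src, count);  // plays the role of default_copy_data
  }
};

struct MasqueradingAllocator final : Allocator {
  explicit MasqueradingAllocator(Allocator* inner) : inner_(inner) {}
  void copy_data(void* dest, const void* src, std::size_t count) const override {
    inner_->copy_data(dest, src, count);  // delegate, as the HIP wrapper does
  }
 private:
  Allocator* inner_;
};

int main() {
  HostAllocator host;
  MasqueradingAllocator wrapper(&host);
  char src[4] = {1, 2, 3, 4}, dst[4] = {};
  wrapper.copy_data(dst, src, sizeof(src));
  assert(dst[3] == 4);
  return 0;
}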
- void StartTrace(const string& mode, bool waitUntilCompleted); + void StartTrace(const std::string& mode, bool waitUntilCompleted); void StopTrace(); + // Abstractions for GPU trace capturing + bool isCaptureEnabled() const; + bool isCapturing() const; + void startCapture(const std::string& name, MPSStream* stream = nullptr); + void stopCapture(MPSStream* stream = nullptr); + // convenience functions to indicate whether signpost tracing or // logging are enabled for the SignpostTypes bool isOperationProfilingEnabled() const { @@ -356,6 +362,9 @@ class MPSProfiler { // a short list that contains copy stats std::unordered_map> m_copy_stat_list{}; + mutable MTLCaptureManager *captureManager = nil; + unsigned captureCount = 0; + void initialize(); void beginProfileExecution(BaseInfo& info, bool cpuExecution = false); void endProfileExecution(BaseInfo& info, os_signpost_id_t event_signpost_id, diff --git a/aten/src/ATen/mps/MPSProfiler.mm b/aten/src/ATen/mps/MPSProfiler.mm index e6e1a7257923b..522328277787b 100644 --- a/aten/src/ATen/mps/MPSProfiler.mm +++ b/aten/src/ATen/mps/MPSProfiler.mm @@ -195,7 +195,7 @@ } } -void MPSProfiler::StartTrace(const string& mode, bool waitUntilCompleted) { +void MPSProfiler::StartTrace(const std::string& mode, bool waitUntilCompleted) { TORCH_CHECK(m_profile_options == ProfileOptions::OPTIONS_NONE, "Tracing Signposts is already enabled "); std::stringstream ss(mode); @@ -765,6 +765,41 @@ struct sigaction MPSProfiler::currentSigint {}; struct sigaction MPSProfiler::previousSigint {}; +bool MPSProfiler::isCapturing() const { + return [captureManager isCapturing]; +} + +bool MPSProfiler::isCaptureEnabled() const { + if (captureManager == nil) { + captureManager = [MTLCaptureManager sharedCaptureManager]; + } + static bool isEnabled = [this]() { + return [captureManager supportsDestination:MTLCaptureDestinationGPUTraceDocument]; + }(); + return isEnabled; +} + +void MPSProfiler::startCapture(const std::string& name, MPSStream* stream) { + if (captureManager == nil) { + captureManager = [MTLCaptureManager sharedCaptureManager]; + } + NSError* err = nil; + NSString* fname = [NSString stringWithFormat:@"%04d-%s.gputrace", captureCount++, name.c_str()]; + MTLCaptureDescriptor* captureDescriptor = [MTLCaptureDescriptor new]; + captureDescriptor.captureObject = stream ? 
(id)stream->commandQueue() : (id)MPSDevice::getInstance()->device(); + captureDescriptor.destination = MTLCaptureDestinationGPUTraceDocument; + captureDescriptor.outputURL = [NSURL fileURLWithPath:fname]; + auto rc = [captureManager startCaptureWithDescriptor:captureDescriptor error:&err]; + TORCH_CHECK(rc, "Failed to start capture of ", [fname UTF8String], " error ", [[err description] UTF8String]); +} + +void MPSProfiler::stopCapture(MPSStream* stream) { + if (stream) { + stream->synchronize(SyncType::COMMIT); + } + [captureManager stopCapture]; +} + } // namespace Profiler Profiler::MPSProfiler& getMPSProfiler() { diff --git a/aten/src/ATen/mps/MPSStream.mm b/aten/src/ATen/mps/MPSStream.mm index 2ac8b0cc64a36..0542a9fbd4c24 100644 --- a/aten/src/ATen/mps/MPSStream.mm +++ b/aten/src/ATen/mps/MPSStream.mm @@ -22,7 +22,7 @@ @interface MPSGraphExecutionDescriptor () _compilationDescriptor = [MPSGraphCompilationDescriptor new]; // disable commitAndContinue if Signpost tracing is enabled - if (getMPSProfiler().isSignpostTracingEnabled()) { + if (getMPSProfiler().isSignpostTracingEnabled() || getMPSProfiler().isCaptureEnabled()) { _enableCommitAndContinue = false; } _executionDescriptor.enableCommitAndContinue = _enableCommitAndContinue; @@ -173,11 +173,22 @@ @interface MPSGraphExecutionDescriptor () endKernelCoalescing(); id blitEncoder = [commandBuffer() blitCommandEncoder]; - [blitEncoder copyFromBuffer:srcBuffer - sourceOffset:(NSUInteger)srcOffset - toBuffer:dstBuffer - destinationOffset:(NSUInteger)dstOffset - size:(NSUInteger)length]; + // For some reason copyFromBuffer for 4Gb fails without returning an error + // See https://github.com/pytorch/pytorch/issues/124335 + // Workaround by batching copy commands into 2Gb chunks + constexpr size_t max_copy_size = 0x80000000; // 2GB + size_t bytes_copied = 0; + size_t bytes_remains = length; + while (bytes_remains > 0) { + NSUInteger bytes_to_copy = std::min(max_copy_size, bytes_remains); + [blitEncoder copyFromBuffer:srcBuffer + sourceOffset:(NSUInteger)srcOffset + bytes_copied + toBuffer:dstBuffer + destinationOffset:(NSUInteger)dstOffset + bytes_copied + size:bytes_to_copy]; + bytes_copied += bytes_to_copy; + bytes_remains -= bytes_to_copy; + } [blitEncoder endEncoding]; // profilerId has a value only if copy profiling is enabled diff --git a/aten/src/ATen/native/Activation.cpp b/aten/src/ATen/native/Activation.cpp index 7f5c696d1f6e7..533bc32216365 100644 --- a/aten/src/ATen/native/Activation.cpp +++ b/aten/src/ATen/native/Activation.cpp @@ -76,7 +76,6 @@ #include #include #include -#include #include #endif @@ -89,8 +88,8 @@ TORCH_META_FUNC(threshold)(const Tensor& self, const Scalar& threshold, const Sc build(TensorIteratorConfig() .set_check_mem_overlap(false) // threshold is idempotent, so overlap is okay .add_output(result) - .add_input(self) - .add_input(self) // other + .add_const_input(self) + .add_const_input(self) // other .allow_cpu_scalars(true) .promote_inputs_to_common_dtype(true) .cast_common_dtype_to_outputs(true) @@ -103,8 +102,8 @@ TORCH_META_FUNC(threshold_backward)(const Tensor& grad, const Tensor& self, cons build(TensorIteratorConfig() .set_check_mem_overlap(false) // threshold is idempotent, so overlap is okay .add_output(gradInput) - .add_input(self) - .add_input(grad) // other + .add_const_input(self) + .add_const_input(grad) // other .allow_cpu_scalars(true) .promote_inputs_to_common_dtype(true) .cast_common_dtype_to_outputs(true) @@ -393,7 +392,7 @@ TORCH_IMPL_FUNC(gelu_out_cpu) ( auto approximate_type = 
get_gelutype_enum(approximate); #if AT_MKLDNN_ENABLED() if (use_mkldnn(self) && (approximate_type == GeluType::None)) { - const ideep::tensor& x = itensor_from_tensor(self); + const ideep::tensor& x = itensor_from_tensor(self, /*from_const_data_ptr*/true); ideep::tensor y = itensor_from_tensor(result); ideep::eltwise_forward::compute( x, y, ideep::algorithm::eltwise_gelu_erf, ideep::prop_kind::forward_training, /*alpha*/ 0.0); @@ -411,8 +410,8 @@ TORCH_IMPL_FUNC(gelu_backward_out_cpu) ( auto approximate_type = get_gelutype_enum(approximate); #if AT_MKLDNN_ENABLED() if (use_mkldnn(self) && (approximate_type == GeluType::None)) { - const ideep::tensor& x = itensor_from_tensor(self); - ideep::tensor grady = itensor_from_tensor(grad); + const ideep::tensor& x = itensor_from_tensor(self, /*from_const_data_ptr*/true); + ideep::tensor grady = itensor_from_tensor(grad, /*from_const_data_ptr*/true); ideep::tensor gradx = itensor_from_tensor(grad_input); ideep::eltwise_backward::compute(x, grady, gradx, ideep::algorithm::eltwise_gelu_erf, /*alpha*/ 0.0); @@ -579,7 +578,7 @@ inline void _rrelu_with_noise_train( opmath_t upper = upper_.to(); Tensor tmp_tensor = output.contiguous(); scalar_t* output_data = tmp_tensor.data_ptr(); - scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); scalar_t* noise_data = noise.data_ptr(); auto gen = at::get_generator_or_default(generator, detail::getDefaultCPUGenerator()); std::lock_guard lock(gen->mutex_); @@ -717,8 +716,8 @@ Tensor _prelu_kernel(const Tensor& self, const Tensor& weight) { auto result = at::empty_like(self); auto iter = TensorIteratorConfig() .add_output(result) - .add_input(self) - .add_input(weight) + .add_const_input(self) + .add_const_input(weight) .build(); prelu_stub(iter.device_type(), iter); return result; @@ -730,9 +729,9 @@ std::tuple _prelu_kernel_backward(const Tensor& grad_out, const auto iter = TensorIteratorConfig() .add_output(grad_self) .add_output(grad_weight) - .add_input(self) - .add_input(weight) - .add_input(grad_out) + .add_const_input(self) + .add_const_input(weight) + .add_const_input(grad_out) .build(); prelu_backward_stub(iter.device_type(), iter); return {grad_self, grad_weight}; @@ -748,9 +747,8 @@ Tensor infinitely_differentiable_gelu_backward( } std::tuple log_sigmoid_forward_cpu(const Tensor& input) { - // FIXME: do these actually need to be zeros_like or can they be empty_like? 
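The recurring change in Activation.cpp is mechanical: operands that are only read are now registered with add_const_input instead of add_input, and buffers that the kernel fully overwrites are allocated with empty_like rather than zeros_like. A minimal sketch of the builder pattern, assuming only the public TensorIteratorConfig API; the op below is a made-up elementwise example and the per-device dispatch is left as a comment:

#include <ATen/ATen.h>
#include <ATen/TensorIterator.h>

// Hypothetical elementwise backward: grad_input = f(self, grad).
at::Tensor elementwise_backward_sketch(const at::Tensor& grad, const at::Tensor& self) {
  // empty_like suffices here: every element is written by the kernel.
  at::Tensor grad_input = at::empty_like(grad);
  auto iter = at::TensorIteratorConfig()
                  .add_output(grad_input)
                  .add_const_input(self)  // read-only operands advertise const-ness
                  .add_const_input(grad)
                  .build();
  // a per-device stub such as some_backward_stub(iter.device_type(), iter) would run here
  return grad_input;
}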
- auto result = at::zeros_like(input, at::MemoryFormat::Contiguous); - auto buffer = at::zeros_like(input, at::MemoryFormat::Contiguous); + auto result = at::empty_like(input, at::MemoryFormat::Contiguous); + auto buffer = at::empty_like(input, at::MemoryFormat::Contiguous); log_sigmoid_cpu_stub(kCPU, result, buffer, input.contiguous()); return std::make_tuple(result, buffer); } @@ -781,8 +779,8 @@ Tensor log_sigmoid_backward_cuda(const Tensor& grad_output, const Tensor& input, // NOTE: buffer is only used by CPU dispatch, we just ignore it here auto iter = at::TensorIteratorConfig() .add_output(grad_input) - .add_input(input) - .add_input(grad_output) + .add_const_input(input) + .add_const_input(grad_output) .build(); log_sigmoid_backward_stub(kCUDA, iter); return iter.output(); @@ -792,9 +790,9 @@ Tensor log_sigmoid_backward_cpu(const Tensor& grad_output, const Tensor& input, auto grad_input = at::empty_like(grad_output); auto iter = at::TensorIteratorConfig() .add_output(grad_input) - .add_input(input) - .add_input(buffer) - .add_input(grad_output) + .add_const_input(input) + .add_const_input(buffer) + .add_const_input(grad_output) .build(); log_sigmoid_backward_stub(kCPU, iter); return iter.output(); @@ -804,8 +802,8 @@ Tensor& log_sigmoid_backward_cuda_out(const Tensor& grad_output, const Tensor& i const Tensor& buffer, Tensor& grad_input) { auto iter = TensorIteratorConfig() .add_output(grad_input) - .add_input(input) - .add_input(grad_output) + .add_const_input(input) + .add_const_input(grad_output) .build(); log_sigmoid_backward_stub(kCUDA, iter); return grad_input; @@ -817,9 +815,9 @@ Tensor& log_sigmoid_backward_cpu_out(const Tensor& grad_output, Tensor& grad_input) { auto iter = TensorIteratorConfig() .add_output(grad_input) - .add_input(input) - .add_input(buffer) - .add_input(grad_output) + .add_const_input(input) + .add_const_input(buffer) + .add_const_input(grad_output) .build(); log_sigmoid_backward_stub(kCPU, iter); return grad_input; diff --git a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp index 38a6e2322ab75..bbd4f68d40d09 100644 --- a/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveAveragePooling3d.cpp @@ -25,7 +25,7 @@ namespace { template static void adaptive_avg_pool3d_out_frame( - scalar_t* input_p, + const scalar_t* input_p, scalar_t* output_p, int64_t sizeD, int64_t isizeT, @@ -57,7 +57,7 @@ static void adaptive_avg_pool3d_out_frame( int kW = iendW - istartW; /* local pointers */ - scalar_t* ip = input_p + d * istrideD + istartT * istrideT + + const scalar_t* ip = input_p + d * istrideD + istartT * istrideT + istartH * istrideH + istartW * istrideW; scalar_t* op = output_p + d * osizeT * osizeH * osizeW + ot * osizeH * osizeW + oh * osizeW + ow; @@ -128,7 +128,7 @@ void adaptive_avg_pool3d_out_cpu_template( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "adaptive_avg_pool3d_cpu", [&] { - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); adaptive_avg_pool3d_out_frame( input_data, @@ -151,7 +151,7 @@ void adaptive_avg_pool3d_out_cpu_template( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input.scalar_type(), "adaptive_avg_pool3d_cpu", [&] { - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); at::parallel_for(0, n, 1, [&](int64_t start, int64_t end) { for (const auto b : c10::irange(start, end)) { @@ -178,7 
+178,7 @@ void adaptive_avg_pool3d_out_cpu_template( template static void adaptive_avg_pool3d_backward_out_frame( scalar_t* gradInput_p, - scalar_t* gradOutput_p, + const scalar_t* gradOutput_p, int64_t sizeD, int64_t isizeT, int64_t isizeH, @@ -189,7 +189,7 @@ static void adaptive_avg_pool3d_backward_out_frame( at::parallel_for(0, sizeD, 1, [&](int64_t start, int64_t end) { for (const auto d : c10::irange(start, end)) { scalar_t* gradInput_p_d = gradInput_p + d * isizeT * isizeW * isizeH; - scalar_t* gradOutput_p_d = gradOutput_p + d * osizeT * osizeW * osizeH; + const scalar_t* gradOutput_p_d = gradOutput_p + d * osizeT * osizeW * osizeH; /* calculate average */ for (const auto ot : c10::irange(osizeT)) { @@ -251,7 +251,7 @@ Tensor& adaptive_avg_pool3d_backward_out_cpu_template( input.scalar_type(), "adaptive_avg_pool3d_backward_cpu", [&] { /* get raw pointers */ scalar_t* gradInput_data = gradInput.data_ptr(); - scalar_t* gradOutput_data = gradOutput.data_ptr(); + const scalar_t* gradOutput_data = gradOutput.const_data_ptr(); adaptive_avg_pool3d_backward_out_frame( gradInput_data, @@ -271,7 +271,7 @@ Tensor& adaptive_avg_pool3d_backward_out_cpu_template( input.scalar_type(), "adaptive_avg_pool3d_backward_cpu", [&] { /* get raw pointers */ scalar_t* gradInput_data = gradInput.data_ptr(); - scalar_t* gradOutput_data = gradOutput.data_ptr(); + const scalar_t* gradOutput_data = gradOutput.const_data_ptr(); at::parallel_for(0, n, 1, [&](int64_t start, int64_t end) { for (const auto b : c10::irange(start, end)) { adaptive_avg_pool3d_backward_out_frame( @@ -317,6 +317,12 @@ Tensor adaptive_avg_pool3d_symint(Tensor const& input, SymIntArrayRef output_siz // in this case, adaptive pooling is just computing mean over hw // dimensions, which can be done more efficiently Tensor out = input.mean({-1, -2, -3}, /* keepdim = */ true); + if (input.suggest_memory_format() == at::MemoryFormat::ChannelsLast3d) { + // assert ndim == 5, since ndim = 4 doesn't give channels_last + const auto n = input.sym_size(0); + const auto c = input.sym_size(1); + out.as_strided__symint({n, c, 1, 1, 1}, {c, 1, c, c, c}); + } return out; } else { return _adaptive_avg_pool3d_symint(input, output_size); diff --git a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp index 78c355d2467a8..001e3c7d2d56e 100644 --- a/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp +++ b/aten/src/ATen/native/AdaptiveMaxPooling3d.cpp @@ -82,7 +82,7 @@ namespace { template static void adaptive_max_pool3d_single_out_frame( - scalar_t *input_p, + const scalar_t *input_p, scalar_t *output_p, int64_t *ind_p, int64_t sizeD, @@ -121,7 +121,7 @@ static void adaptive_max_pool3d_single_out_frame( int64_t kW = iendW - istartW; /* local pointers */ - scalar_t *ip = input_p + d*istrideD + istartT *istrideT + istartH*istrideH + istartW*istrideW; + const scalar_t *ip = input_p + d*istrideD + istartT *istrideT + istartH*istrideH + istartW*istrideW; scalar_t *op = output_p + d*osizeT*osizeH*osizeW + ot*osizeH*osizeW + oh*osizeW + ow; int64_t *indp = ind_p + d*osizeT*osizeH*osizeW + ot*osizeH*osizeW + oh*osizeW + ow; @@ -159,7 +159,7 @@ static void adaptive_max_pool3d_single_out_frame( template static void adaptive_max_pool3d_out_frame( - scalar_t *input_data, + const scalar_t *input_data, scalar_t *output_data, int64_t *indices_data, int64_t sizeB, @@ -192,8 +192,8 @@ static void adaptive_max_pool3d_out_frame( template static void adaptive_max_pool3d_backward_single_out_frame( scalar_t *gradInput_p, - scalar_t 
*gradOutput_p, - int64_t *ind_p, + const scalar_t *gradOutput_p, + const int64_t *ind_p, int64_t sizeD, int64_t isizeT, int64_t isizeH, @@ -205,8 +205,8 @@ static void adaptive_max_pool3d_backward_single_out_frame( at::parallel_for(0, sizeD, 0, [&](int64_t start, int64_t end) { for (const auto d : c10::irange(start, end)) { scalar_t *gradInput_p_d = gradInput_p + d*isizeT*isizeH*isizeW; - scalar_t *gradOutput_p_d = gradOutput_p + d*osizeT*osizeH*osizeW; - int64_t *ind_p_d = ind_p + d*osizeT*osizeH*osizeW; + const scalar_t *gradOutput_p_d = gradOutput_p + d*osizeT*osizeH*osizeW; + const int64_t *ind_p_d = ind_p + d*osizeT*osizeH*osizeW; /* calculate max points */ int64_t ot, oh, ow; @@ -231,8 +231,8 @@ static void adaptive_max_pool3d_backward_single_out_frame( template static void adaptive_max_pool3d_backward_out_frame( scalar_t *gradInput_data, - scalar_t *gradOutput_data, - int64_t *indices_data, + const scalar_t *gradOutput_data, + const int64_t *indices_data, int64_t sizeB, int64_t sizeD, int64_t isizeT, @@ -299,7 +299,7 @@ TORCH_IMPL_FUNC(adaptive_max_pool3d_out_cpu) if (input.ndimension() == 4) { AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, input.scalar_type(), "adaptive_max_pool3d_cpu", [&] { - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); auto indices_data = indices.data_ptr(); @@ -322,7 +322,7 @@ TORCH_IMPL_FUNC(adaptive_max_pool3d_out_cpu) } else { AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, input.scalar_type(), "adaptive_max_pool3d_cpu", [&] { - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); auto indices_data = indices.data_ptr(); @@ -394,8 +394,8 @@ TORCH_IMPL_FUNC(adaptive_max_pool3d_backward_out_cpu) input.scalar_type(), "adaptive_max_pool3d_backward", [&] { /* get raw pointers */ scalar_t* gradInput_data = gradInput.data_ptr(); - scalar_t* gradOutput_data = gradOutput_.data_ptr(); - int64_t* indices_data = indices.data_ptr(); + const scalar_t* gradOutput_data = gradOutput_.const_data_ptr(); + const int64_t* indices_data = indices.const_data_ptr(); adaptive_max_pool3d_backward_single_out_frame( gradInput_data, @@ -414,8 +414,8 @@ TORCH_IMPL_FUNC(adaptive_max_pool3d_backward_out_cpu) input.scalar_type(), "adaptive_max_pool3d_backward", [&] { /* get raw pointers */ scalar_t* gradInput_data = gradInput.data_ptr(); - scalar_t* gradOutput_data = gradOutput_.data_ptr(); - int64_t* indices_data = indices.data_ptr(); + const scalar_t* gradOutput_data = gradOutput_.const_data_ptr(); + const int64_t* indices_data = indices.const_data_ptr(); adaptive_max_pool3d_backward_out_frame( gradInput_data, diff --git a/aten/src/ATen/native/AdaptivePooling.h b/aten/src/ATen/native/AdaptivePooling.h index d342d218e449a..bb2fda9906abe 100644 --- a/aten/src/ATen/native/AdaptivePooling.h +++ b/aten/src/ATen/native/AdaptivePooling.h @@ -8,15 +8,25 @@ namespace at::native { -using adaptive_avg_pooling_fn = void(*)(Tensor& output, const Tensor& input, IntArrayRef output_size); -using adaptive_avg_pooling_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output); -DECLARE_DISPATCH(adaptive_avg_pooling_fn, adaptive_avg_pool2d_kernel); -DECLARE_DISPATCH(adaptive_avg_pooling_backward_fn, adaptive_avg_pool2d_backward_kernel); - -using adaptive_max_pooling_fn = void(*)(const Tensor& output, const Tensor& indices, const Tensor& input, IntArrayRef output_size); -using adaptive_max_pooling_backward_fn = void(*)(const Tensor& grad_input, const Tensor& grad_output, 
const Tensor& indices); -DECLARE_DISPATCH(adaptive_max_pooling_fn, adaptive_max_pool2d_kernel); -DECLARE_DISPATCH(adaptive_max_pooling_backward_fn, adaptive_max_pool2d_backward_kernel); +using adaptive_avg_pooling2d_fn = void(*)(Tensor& output, const Tensor& input, IntArrayRef output_size); +using adaptive_avg_pooling2d_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output); +DECLARE_DISPATCH(adaptive_avg_pooling2d_fn, adaptive_avg_pool2d_kernel); +DECLARE_DISPATCH(adaptive_avg_pooling2d_backward_fn, adaptive_avg_pool2d_backward_kernel); + +using adaptive_max_pooling2d_fn = void(*)(const Tensor& output, const Tensor& indices, const Tensor& input, IntArrayRef output_size); +using adaptive_max_pooling2d_backward_fn = void(*)(const Tensor& grad_input, const Tensor& grad_output, const Tensor& indices); +DECLARE_DISPATCH(adaptive_max_pooling2d_fn, adaptive_max_pool2d_kernel); +DECLARE_DISPATCH(adaptive_max_pooling2d_backward_fn, adaptive_max_pool2d_backward_kernel); + +using adaptive_avg_pooling3d_fn = void(*)(Tensor& output, const Tensor& input, IntArrayRef output_size); +using adaptive_avg_pooling3d_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output); +DECLARE_DISPATCH(adaptive_avg_pooling3d_fn, adaptive_avg_pool3d_kernel); +DECLARE_DISPATCH(adaptive_avg_pooling3d_backward_fn, adaptive_avg_pool3d_backward_kernel); + +using adaptive_max_pooling3d_fn = void(*)(const Tensor& output, const Tensor& indices, const Tensor& input, IntArrayRef output_size); +using adaptive_max_pooling3d_backward_fn = void(*)(const Tensor& grad_input, const Tensor& grad_output, const Tensor& indices); +DECLARE_DISPATCH(adaptive_max_pooling3d_fn, adaptive_max_pool3d_kernel); +DECLARE_DISPATCH(adaptive_max_pooling3d_backward_fn, adaptive_max_pool3d_backward_kernel); static inline int64_t start_index(int64_t a, int64_t b, int64_t c) { return (a / b) * c + ((a % b) * c) / b; diff --git a/aten/src/ATen/native/AffineGridGenerator.cpp b/aten/src/ATen/native/AffineGridGenerator.cpp index 17e45acb1bb76..315027d7069b4 100644 --- a/aten/src/ATen/native/AffineGridGenerator.cpp +++ b/aten/src/ATen/native/AffineGridGenerator.cpp @@ -110,7 +110,7 @@ static Tensor affine_grid_generator_4D_backward( AT_ASSERT(grad_grid.sizes() == IntArrayRef({N, H, W, 2})); auto grad_theta = base_grid.view({N, H * W, 3}) .transpose(1, 2) - .bmm(grad_grid.view({N, H * W, 2})); + .bmm(grad_grid.reshape({N, H * W, 2})); return grad_theta.transpose(1, 2); } @@ -126,7 +126,7 @@ static Tensor affine_grid_generator_5D_backward( AT_ASSERT(grad_grid.sizes() == IntArrayRef({N, D, H, W, 3})); auto grad_theta = base_grid.view({N, D * H * W, 4}) .transpose(1, 2) - .bmm(grad_grid.view({N, D * H * W, 3})); + .bmm(grad_grid.reshape({N, D * H * W, 3})); return grad_theta.transpose(1, 2); } diff --git a/aten/src/ATen/native/AmpKernels.cpp b/aten/src/ATen/native/AmpKernels.cpp new file mode 100644 index 0000000000000..32248c943193a --- /dev/null +++ b/aten/src/ATen/native/AmpKernels.cpp @@ -0,0 +1,41 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif + +namespace at::native { + +void _amp_foreach_non_finite_check_and_unscale_cpu_( + TensorList scaled_grads, + at::Tensor& found_inf, + const at::Tensor& inv_scale) { + _amp_foreach_non_finite_check_and_unscale_cpu_stub( + found_inf.device().type(), scaled_grads, found_inf, inv_scale); +} + +at::Tensor& _amp_update_scale_cpu_ ( + at::Tensor& current_scale, + 
at::Tensor& growth_tracker, + const at::Tensor& found_inf, + double growth_factor, + double backoff_factor, + int64_t growth_interval) { + return _amp_update_scale_cpu_stub( + growth_tracker.device().type(), current_scale, growth_tracker, + found_inf, growth_factor, backoff_factor, growth_interval); +} + +DEFINE_DISPATCH(_amp_foreach_non_finite_check_and_unscale_cpu_stub); +DEFINE_DISPATCH(_amp_update_scale_cpu_stub); + +} // namespace at::native diff --git a/aten/src/ATen/native/AmpKernels.h b/aten/src/ATen/native/AmpKernels.h new file mode 100644 index 0000000000000..c463c80e1c6dc --- /dev/null +++ b/aten/src/ATen/native/AmpKernels.h @@ -0,0 +1,28 @@ +#pragma once + +#include +#include + +namespace at { +class Tensor; + +namespace native { + +using _amp_foreach_non_finite_check_and_unscale_cpu__fn = void (*)( + TensorList, + Tensor&, + const Tensor&); + +using _amp_update_scale_cpu__fn = Tensor& (*)( + Tensor&, + Tensor&, + const Tensor&, + double, + double, + int64_t); + +DECLARE_DISPATCH(_amp_foreach_non_finite_check_and_unscale_cpu__fn, _amp_foreach_non_finite_check_and_unscale_cpu_stub); +DECLARE_DISPATCH(_amp_update_scale_cpu__fn, _amp_update_scale_cpu_stub); + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/AutogradComposite.cpp b/aten/src/ATen/native/AutogradComposite.cpp index c97c7e2b139a4..dc98c90a596dd 100644 --- a/aten/src/ATen/native/AutogradComposite.cpp +++ b/aten/src/ATen/native/AutogradComposite.cpp @@ -1,6 +1,7 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -10,6 +11,7 @@ #include #include #include +#include #include #include #endif @@ -41,17 +43,17 @@ Tensor _new_zeros_with_same_feature_meta( const at::Tensor& self, const at::Tensor& other, int64_t self_num_batch_dims) { - auto other_sizes = other.sizes(); - auto other_strides = other.strides(); + auto other_sizes = other.sym_sizes(); + auto other_strides = other.sym_strides(); auto other_storage_offset = other.storage_offset(); - int64_t other_storage_numel = other.storage().nbytes() / other.itemsize(); + auto other_storage_numel = other.storage().sym_nbytes() / other.itemsize(); if (self_num_batch_dims == 0) { - auto new_tensor = at::zeros({other_storage_numel}, other.options()); - return new_tensor.as_strided(other_sizes, other_strides, other_storage_offset); + auto new_tensor = at::zeros_symint({other_storage_numel}, other.options()); + return new_tensor.as_strided_symint(other_sizes, other_strides, other_storage_offset); } - auto self_sizes = self.sizes(); + auto self_sizes = self.sym_sizes(); // NB: We don't check that the sizes of self is the same as that of other // because this function is also used in the inplace over view case @@ -63,14 +65,14 @@ Tensor _new_zeros_with_same_feature_meta( // this case. 
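The stride computation just below tacks batch-dimension strides onto other's layout by accumulating a running product seeded with other's storage size, so that each batch slice lands in its own contiguous block of the new storage. For reference, a minimal sketch of that running-product accumulation in isolation, using plain int64_t instead of SymInt and names of my own choosing:

#include <cstdint>
#include <vector>

// Contiguous (row-major) strides: stride[i] is the product of all sizes to its right,
// seeded with `base` (1 for an ordinary tensor, the slice numel when stacking slices).
std::vector<int64_t> strides_from_sizes(const std::vector<int64_t>& sizes, int64_t base) {
  std::vector<int64_t> strides(sizes.size());
  int64_t prod = base;
  for (int64_t i = static_cast<int64_t>(sizes.size()) - 1; i >= 0; --i) {
    strides[i] = prod;
    prod *= sizes[i];
  }
  return strides;
}

// strides_from_sizes({2, 3, 4}, 1) == {12, 4, 1}; seeding base with the storage numel of
// `other` gives the same accumulation used for the batch strides computed below.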
constexpr int64_t kSmallBufferSizeHint = 8; - auto out_sizes = c10::SmallBuffer(other.dim() + self_num_batch_dims); + auto out_sizes = c10::SmallVector(other.dim() + self_num_batch_dims); std::copy(self_sizes.begin(), self_sizes.begin() + self_num_batch_dims, out_sizes.begin()); std::copy(other_sizes.begin(), other_sizes.end(), out_sizes.begin() + self_num_batch_dims); // We use the strides of other, and tack on the strides computed with // the batch dims of self, so that the slices are arranged contiguously - auto out_strides = c10::SmallBuffer(other.dim() + self_num_batch_dims); - int64_t prod = other_storage_numel; + auto out_strides = c10::SmallVector(other.dim() + self_num_batch_dims); + auto prod = other_storage_numel; for (int64_t i = self_num_batch_dims - 1; i >= 0; --i) { out_strides[i] = prod; @@ -78,15 +80,30 @@ Tensor _new_zeros_with_same_feature_meta( } std::copy(other_strides.begin(), other_strides.end(), out_strides.begin() + self_num_batch_dims); - int64_t storage_numel = prod; + auto storage_numel = prod; // Inherit the TensorOptions of the primal - auto new_tensor = at::zeros({storage_numel}, other.options()); - return new_tensor.as_strided(out_sizes, out_strides, other_storage_offset); + auto new_tensor = at::zeros_symint({storage_numel}, other.options()); + return new_tensor.as_strided_symint(out_sizes, out_strides, other_storage_offset); } bool _has_same_storage_numel(const at::Tensor& base, const at::Tensor& other) { - return base.storage().nbytes() / base.itemsize() == other.storage().nbytes() / other.itemsize(); + return base.storage().sym_nbytes() / base.itemsize() == other.storage().sym_nbytes() / other.itemsize(); +} + +Tensor _lazy_clone(Tensor const& self) { + c10::StorageImpl* self_storage = self.storage().unsafeGetStorageImpl(); + c10::intrusive_ptr storage = + c10::impl::cow::lazy_clone_storage(*self_storage); + TORCH_CHECK(storage != nullptr); + auto tensor = c10::make_intrusive( + c10::Storage(std::move(storage)), + self.key_set(), + self.dtype()); + tensor->set_sizes_and_strides(self.sym_sizes(), + self.sym_strides(), + self.sym_storage_offset()); + return Tensor(std::move(tensor)); } } // namespace at::native diff --git a/aten/src/ATen/native/AveragePool3d.cpp b/aten/src/ATen/native/AveragePool3d.cpp index 110095e6fff00..c2d7b44a5076c 100644 --- a/aten/src/ATen/native/AveragePool3d.cpp +++ b/aten/src/ATen/native/AveragePool3d.cpp @@ -155,7 +155,7 @@ namespace { template static void avg_pool3d_out_frame( - scalar_t *input_p, + const scalar_t *input_p, scalar_t *output_p, int64_t nslices, int64_t itime, @@ -182,7 +182,7 @@ static void avg_pool3d_out_frame( int64_t i, j, ti; /* local pointers. 
*/ - scalar_t *ip = input_p + k * itime * iwidth * iheight; + const scalar_t *ip = input_p + k * itime * iwidth * iheight; scalar_t *op = output_p + k * otime * owidth * oheight; for (i = 0; i < otime * oheight * owidth; ++i) *(op + i) = 0; @@ -295,7 +295,7 @@ TORCH_IMPL_FUNC(avg_pool3d_out_cpu) ( AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, input.scalar_type(), "avg_pool3d_out_frame", [&] { - scalar_t *input_data = input.data_ptr(); + const scalar_t *input_data = input.const_data_ptr(); scalar_t *output_data = output.data_ptr(); avg_pool3d_out_frame( @@ -318,7 +318,7 @@ TORCH_IMPL_FUNC(avg_pool3d_out_cpu) ( AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Long, input.scalar_type(), "avg_pool3d_out_frame", [&] { - scalar_t *input_data = input.data_ptr(); + const scalar_t *input_data = input.const_data_ptr(); scalar_t *output_data = output.data_ptr(); at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { @@ -344,7 +344,7 @@ namespace { template static void avg_pool3d_backward_out_frame( scalar_t *gradInput_p, - scalar_t *gradOutput_p, + const scalar_t *gradOutput_p, int64_t nslices, int64_t itime, int64_t iwidth, @@ -371,7 +371,7 @@ static void avg_pool3d_backward_out_frame( /* local pointers */ scalar_t *ip = gradInput_p + k * itime * iwidth * iheight; - scalar_t *op = gradOutput_p + k * otime * owidth * oheight; + const scalar_t *op = gradOutput_p + k * otime * owidth * oheight; for (i = 0; i < itime*iwidth*iheight; i++) *(ip + i) = 0; @@ -479,7 +479,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cpu) ( "avg_pool3d_backward_out_frame", [&] { scalar_t *gradInput_data = gradInput.data_ptr(); - scalar_t *gradOutput_data = gradOutput.data_ptr(); + const scalar_t *gradOutput_data = gradOutput.const_data_ptr(); avg_pool3d_backward_out_frame( gradInput_data, gradOutput_data, @@ -503,7 +503,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cpu) ( "avg_pool3d_backward_out_frame", [&] { scalar_t *gradInput_data = gradInput.data_ptr(); - scalar_t *gradOutput_data = gradOutput.data_ptr(); + const scalar_t *gradOutput_data = gradOutput.const_data_ptr(); at::parallel_for(0, nbatch, 0, [&](int64_t start, int64_t end) { for (const auto p : c10::irange(start, end)) { diff --git a/aten/src/ATen/native/BatchLinearAlgebra.cpp b/aten/src/ATen/native/BatchLinearAlgebra.cpp index 0719cd3ab5600..40e6b34dc9725 100644 --- a/aten/src/ATen/native/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebra.cpp @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include #include @@ -463,8 +465,7 @@ TORCH_META_FUNC(linalg_ldl_solve) " does not match b dtype ", B.scalar_type()); - std::vector B_broadcast_size; - std::tie(B_broadcast_size, std::ignore) = at::native::_linalg_broadcast_batch_dims(B, LD); + auto [B_broadcast_size, _] = at::native::_linalg_broadcast_batch_dims(B, LD); // prefer column major strides auto result_strides = at::native::batched_matrix_contiguous_strides(B_broadcast_size, /*column_major=*/true); @@ -480,8 +481,7 @@ TORCH_META_FUNC(triangular_solve)(const Tensor& self, const Tensor& A, bool uppe at::native::linearSolveCheckInputs(self, A, "triangular_solve"); if (A.layout() == Layout::Strided) { - std::vector self_broadcast_size, A_broadcast_size; - std::tie(self_broadcast_size, A_broadcast_size) = at::native::_linalg_broadcast_batch_dims(self, A); + auto [self_broadcast_size, A_broadcast_size] = at::native::_linalg_broadcast_batch_dims(self, A); // make column major strides for BLAS const auto solution_strides = 
at::native::batched_matrix_contiguous_strides(self_broadcast_size, /*f-contig=*/true); @@ -629,8 +629,7 @@ TORCH_META_FUNC(linalg_qr)(const Tensor& A, c10::string_view mode) { at::native::checkIsMatrix(A, "linalg.qr"); at::native::checkFloatingOrComplex(A, "linalg.qr"); - bool compute_q, reduced_mode; - std::tie(compute_q, reduced_mode) = at::native::_parse_qr_mode(mode); + auto [compute_q, reduced_mode] = at::native::_parse_qr_mode(mode); auto A_shape = A.sizes().vec(); const auto m = A_shape.cend()[-2]; @@ -1517,7 +1516,7 @@ void _linalg_check_errors( } else { // Find the first non-zero info auto infos_cpu = infos.to(at::kCPU); - auto ptr = infos_cpu.data_ptr(); + auto ptr = infos_cpu.const_data_ptr(); auto n = infos.numel(); auto info_ptr = std::find_if(ptr, ptr + n, [](int32_t x) { return x != 0; }); info = *info_ptr; @@ -1604,8 +1603,7 @@ Tensor& linalg_inv_out(const Tensor& A, Tensor& result) { } Tensor linalg_inv(const Tensor& A) { - Tensor result, info; - std::tie(result, info) = at::linalg_inv_ex(A); + auto [result, info] = at::linalg_inv_ex(A); at::_linalg_check_errors(info, "linalg.inv", A.dim() == 2); return result; } @@ -1627,7 +1625,7 @@ static void apply_cholesky_solve(Tensor& b, Tensor& A, bool upper, Tensor& infos #else char uplo = upper ? 'U' : 'L'; - auto A_data = A.data_ptr(); + auto A_data = A.const_data_ptr(); auto b_data = b.data_ptr(); auto infos_data = infos.data_ptr(); auto A_mat_stride = matrixStride(A); @@ -1640,9 +1638,9 @@ static void apply_cholesky_solve(Tensor& b, Tensor& A, bool upper, Tensor& infos // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int info; for (const auto i : c10::irange(batch_size)) { - scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; + const scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; scalar_t* b_working_ptr = &b_data[i * b_mat_stride]; - lapackCholeskySolve(uplo, n, nrhs, A_working_ptr, ldab, b_working_ptr, ldab, &info); + lapackCholeskySolve(uplo, n, nrhs, const_cast(A_working_ptr), ldab, b_working_ptr, ldab, &info); infos_data[i] = info; if (info != 0) { return; @@ -1669,8 +1667,7 @@ Tensor cholesky_solve(const Tensor& self, const Tensor& A, bool upper) { "b should have at least 2 dimensions, but has ", self.dim(), " dimensions instead"); TORCH_CHECK(A.dim() >= 2, "u should have at least 2 dimensions, but has ", A.dim(), " dimensions instead"); - Tensor self_broadcasted, A_broadcasted; - std::tie(self_broadcasted, A_broadcasted) = _linalg_broadcast_batch_dims(self, A, "cholesky_solve"); + auto [self_broadcasted, A_broadcasted] = _linalg_broadcast_batch_dims(self, A, "cholesky_solve"); return at::_cholesky_solve_helper(self_broadcasted, A_broadcasted, upper); } @@ -1783,8 +1780,7 @@ TORCH_IMPL_FUNC(linalg_cholesky_ex_out)(const Tensor& A, } Tensor linalg_cholesky(const Tensor& A, bool upper) { - Tensor L, info; - std::tie(L, info) = at::linalg_cholesky_ex(A, upper, /*check_errors=*/false); + auto [L, info] = at::linalg_cholesky_ex(A, upper, /*check_errors=*/false); at::_linalg_check_errors(info, "linalg.cholesky", A.dim() == 2); return L; } @@ -1921,8 +1917,7 @@ std::tuple linalg_solve_ex(const Tensor& A, const Tensor& B, bool left, bool check_errors) { - Tensor result, LU, pivots, info; - std::tie(result, LU, pivots, info) = at::_linalg_solve_ex(A, B, left, check_errors); + auto [result, LU, pivots, info] = at::_linalg_solve_ex(A, B, left, check_errors); return std::make_tuple(std::move(result), std::move(info)); } @@ -1939,8 +1934,7 @@ Tensor& linalg_solve_out(const Tensor& A, Tensor linalg_solve(const Tensor& A, 
const Tensor& B, bool left) { - Tensor result, info; - std::tie(result, info) = at::linalg_solve_ex(A, B, left); + auto [result, info] = at::linalg_solve_ex(A, B, left); at::_linalg_check_errors(info, "torch.linalg.solve", A.dim() == 2); return result; } @@ -1980,8 +1974,7 @@ std::tuple linalg_lu_factor_out(const Tensor& A, bool pivot, T } std::tuple linalg_lu_factor(const Tensor& A, bool pivot) { - Tensor LU, pivots, info; - std::tie(LU, pivots, info) = at::linalg_lu_factor_ex(A, pivot, /*check_errors=*/false); + auto [LU, pivots, info] = at::linalg_lu_factor_ex(A, pivot, /*check_errors=*/false); at::_linalg_check_errors(info, "torch.linalg.lu_factor", A.dim() == 2); return std::make_tuple(std::move(LU), std::move(pivots)); } @@ -2088,7 +2081,7 @@ TORCH_IMPL_FUNC(lu_unpack_out)(const Tensor& LU, .resize_outputs(false) .declare_static_shape(pivots.sizes(), /*squash_dim=*/pivots.dim() - 1) .add_output(perm) - .add_owned_input(pivots.contiguous()) + .add_owned_const_input(pivots.contiguous()) .build(); unpack_pivots_stub(pivots.device().type(), iter, std::min(m, n), m); @@ -2237,8 +2230,7 @@ static void triangular_solve_out_impl( } TORCH_IMPL_FUNC(triangular_solve_out)(const Tensor& self, const Tensor& A, bool upper, bool transpose, bool unitriangular, const Tensor& result, const Tensor& clone_A) { - Tensor self_broadcast, A_broadcast; - std::tie(self_broadcast, A_broadcast) = _linalg_broadcast_batch_dims(self, A, "triangular_solve"); + auto [self_broadcast, A_broadcast] = _linalg_broadcast_batch_dims(self, A, "triangular_solve"); bool copy_needed = !result.transpose(-2, -1).is_contiguous(); copy_needed |= !clone_A.transpose(-2, -1).is_contiguous(); @@ -2370,8 +2362,7 @@ TORCH_IMPL_FUNC(linalg_qr_out)(const Tensor& A, auto m = A.size(-2); auto n = A.size(-1); auto k = std::min(m, n); - bool compute_q, reduced_mode; - std::tie(compute_q, reduced_mode) = at::native::_parse_qr_mode(mode); + auto [compute_q, reduced_mode] = at::native::_parse_qr_mode(mode); // We need an auxiliary tensor to call geqrf @@ -2783,7 +2774,7 @@ Tensor linalg_eigvalsh(const Tensor& A, c10::string_view uplo) { Tensor& linalg_eigvalsh_out(const Tensor& A, c10::string_view uplo, Tensor& L) { auto V = at::empty({0}, A.options()); - at::_linalg_eigh_out(L, V, A, uplo, /*comptue_v=*/false); + at::_linalg_eigh_out(L, V, A, uplo, /*compute_v=*/false); return L; } @@ -2803,13 +2794,13 @@ static void linalg_eig_make_complex_eigenvectors_impl(Tensor& result, const Tens auto matrix_stride = matrixStride(real_vectors); auto result_data = result.data_ptr>(); - auto real_vectors_data = real_vectors.data_ptr(); - auto values_data = complex_values.data_ptr>(); + auto real_vectors_data = real_vectors.const_data_ptr(); + auto values_data = complex_values.const_data_ptr>(); for (auto b = decltype(batch_size){0}; b < batch_size; b++) { - scalar_t* vecs = &real_vectors_data[b * matrix_stride]; + const scalar_t* vecs = &real_vectors_data[b * matrix_stride]; c10::complex* res = &result_data[b * matrix_stride]; - c10::complex* vals = &values_data[b * n]; + const c10::complex* vals = &values_data[b * n]; for (auto j = decltype(n){0}; j < n; j++) { if (vals[j].imag() == 0.0) { // eigenvalue is real, then v(j) = VR(:,j) for (auto i = decltype(n){0}; i < n; i++) { @@ -3111,12 +3102,13 @@ Tensor linalg_eigvals(const Tensor& input) { if (_may_require_fw_or_bw_grad(input)) { return std::get<0>(at::linalg_eig(input)); } + return at::_linalg_eigvals(input); +} +Tensor _linalg_eigvals(const Tensor& input) { ScalarType complex_dtype = 
toComplexType(input.scalar_type()); Tensor values = at::empty({0}, input.options().dtype(complex_dtype)); - - at::linalg_eigvals_outf(input, values); - + linalg_eigvals_out(input, values); return values; } @@ -3164,7 +3156,7 @@ TORCH_IMPL_FUNC(_linalg_svd_out)(const Tensor& A, TORCH_CHECK(use_cusolver || !driver.has_value(), "torch.linalg.svd: keyword argument `driver=` is only supported on CUDA inputs with cuSOLVER backend."); - // A always needs to be copied as its contents will be destroyed during the computaton of the SVD + // A always needs to be copied as its contents will be destroyed during the computation of the SVD // Now, MAGMA needs the copy to be on CPU, while cuSOLVER needs it to be on CUDA, so we'll defer // the copy as a column major matrix to the backends. const auto info = at::zeros(IntArrayRef(A.sizes().begin(), A.sizes().end() - 2), A.options().dtype(kInt)); @@ -3213,7 +3205,7 @@ Tensor& linalg_svdvals_out(const Tensor& A, c10::optional driv // Dummies auto U = at::empty({0}, A.options()); auto Vh = at::empty({0}, A.options()); - at::_linalg_svd_out(U, S, Vh, A, /*full_matrices=*/false, /*comptue_uv=*/false, /*driver=*/driver); + at::_linalg_svd_out(U, S, Vh, A, /*full_matrices=*/false, /*compute_uv=*/false, /*driver=*/driver); return S; } @@ -3740,8 +3732,7 @@ std::tuple linalg_ldl_factor_out( std::tuple linalg_ldl_factor( const Tensor& self, bool hermitian) { - Tensor LD, pivots, info; - std::tie(LD, pivots, info) = + auto [LD, pivots, info] = at::linalg_ldl_factor_ex(self, hermitian, /*check_errors=*/false); at::_linalg_check_errors(info, "torch.linalg.ldl_factor", self.dim() == 2); return std::make_tuple(std::move(LD), std::move(pivots)); @@ -3820,8 +3811,7 @@ Tensor& linalg_solve_triangular_out( bool unitriangular, Tensor& out) { checkInputsSolver(A, B, left, "linalg.solve_triangular"); - Tensor A_, B_; - std::tie(B_, A_) = _linalg_broadcast_batch_dims(B, A, /*don't check errors*/nullptr); + auto [B_, A_] = _linalg_broadcast_batch_dims(B, A, /*don't check errors*/nullptr); // We'll write F-contig / F-transpose for FORTRAN contiguous / FORTRAN transpose etc // We say that a matrix is F-ready if it's F-contig OR F-transpose @@ -3913,7 +3903,7 @@ Tensor& linalg_solve_triangular_out( } // No need to conjugate anything if out_f is conj as AX = conj(B) <=> conj(A)conj(X) = B - // and X = B after the algortihm. We just anotate that A is conjugated later on + // and X = B after the algorithm. 
We just annotate that A is conjugated later on // The solution will be written into out_f, so it'll be conjugated already Tensor A_f = std::move(A_); // The A that will go into fortran @@ -3922,7 +3912,7 @@ Tensor& linalg_solve_triangular_out( bool A_is_neg = A_f.is_neg() != out_f.is_neg(); bool A_is_f_contig = (A_f.stride(-1) == 1) == transpose_A; if C10_UNLIKELY (!is_row_or_column_contiguous(A_f)) { - // We first anotate with flags on A_f all the conj / transpose / neg coming from out + // We first annotate with flags on A_f all the conj / transpose / neg coming from out // and then we clone the resulting tensor to resolve all of them in memory if (out_f.is_conj()) { A_f = A_f.conj(); diff --git a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp index 8a04a66f388b1..f29970afe2b44 100644 --- a/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp +++ b/aten/src/ATen/native/BatchLinearAlgebraKernel.cpp @@ -402,7 +402,7 @@ inline void apply_orgqr(Tensor& self, const Tensor& tau) { using value_t = typename c10::scalar_value_type::type; auto self_data = self.data_ptr(); - auto tau_data = tau.data_ptr(); + auto tau_data = tau.const_data_ptr(); auto self_matrix_stride = matrixStride(self); auto tau_stride = tau.size(-1); auto batch_size = batchCount(self); @@ -423,17 +423,17 @@ inline void apply_orgqr(Tensor& self, const Tensor& tau) { // and (batch_size - 1) calls to allocate and deallocate workspace using at::empty() int lwork = -1; scalar_t wkopt; - lapackOrgqr(m, n, k, self_data, lda, tau_data, &wkopt, lwork, &info); + lapackOrgqr(m, n, k, self_data, lda, const_cast(tau_data), &wkopt, lwork, &info); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info == 0); lwork = std::max(1, real_impl(wkopt)); Tensor work = at::empty({lwork}, self.options()); for (const auto i : c10::irange(batch_size)) { scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; - scalar_t* tau_working_ptr = &tau_data[i * tau_stride]; + const scalar_t* tau_working_ptr = &tau_data[i * tau_stride]; // now compute the actual Q - lapackOrgqr(m, n, k, self_working_ptr, lda, tau_working_ptr, work.data_ptr(), lwork, &info); + lapackOrgqr(m, n, k, self_working_ptr, lda, const_cast(tau_working_ptr), work.data_ptr(), lwork, &info); // info from lapackOrgqr only reports if the i-th parameter is wrong // so we don't need to check it all the time @@ -649,8 +649,8 @@ void apply_ormqr(const Tensor& input, const Tensor& tau, const Tensor& other, bo char side = left ? 'L' : 'R'; char trans = transpose ? (input.is_complex() ? 
'C' : 'T') : 'N'; - auto input_data = input.data_ptr(); - auto tau_data = tau.data_ptr(); + auto input_data = input.const_data_ptr(); + auto tau_data = tau.const_data_ptr(); auto other_data = other.data_ptr(); auto input_matrix_stride = matrixStride(input); @@ -670,21 +670,21 @@ void apply_ormqr(const Tensor& input, const Tensor& tau, const Tensor& other, bo // Query for the optimal size of the workspace tensor int lwork = -1; scalar_t wkopt; - lapackOrmqr(side, trans, m, n, k, input_data, lda, tau_data, other_data, ldc, &wkopt, lwork, &info); + lapackOrmqr(side, trans, m, n, k, const_cast(input_data), lda, const_cast(tau_data), other_data, ldc, &wkopt, lwork, &info); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info == 0); lwork = std::max(1, real_impl(wkopt)); Tensor work = at::empty({lwork}, input.options()); for (const auto i : c10::irange(batch_size)) { - scalar_t* input_working_ptr = &input_data[i * input_matrix_stride]; + const scalar_t* input_working_ptr = &input_data[i * input_matrix_stride]; scalar_t* other_working_ptr = &other_data[i * other_matrix_stride]; - scalar_t* tau_working_ptr = &tau_data[i * tau_stride]; + const scalar_t* tau_working_ptr = &tau_data[i * tau_stride]; // now compute the actual result lapackOrmqr( side, trans, m, n, k, - input_working_ptr, lda, - tau_working_ptr, + const_cast(input_working_ptr), lda, + const_cast(tau_working_ptr), other_working_ptr, ldc, work.data_ptr(), lwork, &info); @@ -725,7 +725,7 @@ void apply_triangular_solve(const Tensor& A, const Tensor& B, bool left, bool up char side = left ? 'L' : 'R'; const char trans = to_blas(transpose); - auto A_data = A.data_ptr(); + auto A_data = A.const_data_ptr(); auto B_data = B.data_ptr(); auto A_mat_stride = matrixStride(A); auto B_mat_stride = matrixStride(B); @@ -737,9 +737,9 @@ void apply_triangular_solve(const Tensor& A, const Tensor& B, bool left, bool up auto ldb = std::max(1, B.size(-2)); for (const auto i : c10::irange(batch_size)) { - scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; + const scalar_t* A_working_ptr = &A_data[i * A_mat_stride]; scalar_t* B_working_ptr = &B_data[i * B_mat_stride]; - blasTriangularSolve(side, uplo, trans, diag, m, n, A_working_ptr, lda, B_working_ptr, ldb); + blasTriangularSolve(side, uplo, trans, diag, m, n, const_cast(A_working_ptr), lda, B_working_ptr, ldb); } #endif } @@ -841,26 +841,26 @@ void apply_ldl_solve( auto b_stride = B.dim() > 2 ? B.stride(-3) : 0; auto pivots_stride = pivots.dim() > 1 ? pivots.stride(-2) : 0; - auto a_data = A.data_ptr(); + auto a_data = A.const_data_ptr(); auto b_data = B.data_ptr(); auto pivots_ = pivots.to(kInt); - auto pivots_data = pivots_.data_ptr(); + auto pivots_data = pivots_.const_data_ptr(); auto ldl_solve_func = hermitian ? lapackLdlSolveHermitian : lapackLdlSolveSymmetric; int info = 0; for (const auto i : c10::irange(batch_size)) { - scalar_t* a_working_ptr = &a_data[i * a_stride]; + const scalar_t* a_working_ptr = &a_data[i * a_stride]; scalar_t* b_working_ptr = &b_data[i * b_stride]; - auto* pivots_working_ptr = &pivots_data[i * pivots_stride]; + const auto* pivots_working_ptr = &pivots_data[i * pivots_stride]; ldl_solve_func( uplo, n, nrhs, - a_working_ptr, + const_cast(a_working_ptr), lda, - pivots_working_ptr, + const_cast(pivots_working_ptr), b_working_ptr, ldb, &info); @@ -968,9 +968,9 @@ void apply_lu_solve(const Tensor& LU, const Tensor& pivots, const Tensor& B, Tra "PyTorch with LAPACK. 
Please use PyTorch built with LAPACK support."); #else auto b_data = B.data_ptr(); - auto lu_data = LU.data_ptr(); + auto lu_data = LU.const_data_ptr(); const auto trans = to_blas(transpose); - auto pivots_data = pivots.data_ptr(); + auto pivots_data = pivots.const_data_ptr(); auto b_stride = matrixStride(B); auto lu_stride = LU.dim() > 2 ? LU.stride(-3) : 0; auto pivots_stride = pivots.dim() > 1 ? pivots.stride(-2) : 0; @@ -992,10 +992,10 @@ void apply_lu_solve(const Tensor& LU, const Tensor& pivots, const Tensor& B, Tra for (const auto i : c10::irange(batch_size)) { int64_t lu_index_i = lu_index(i); scalar_t* b_working_ptr = &b_data[i * b_stride]; - scalar_t* lu_working_ptr = &lu_data[lu_index_i * lu_stride]; - int* pivots_working_ptr = &pivots_data[lu_index_i * pivots_stride]; + const scalar_t* lu_working_ptr = &lu_data[lu_index_i * lu_stride]; + const int* pivots_working_ptr = &pivots_data[lu_index_i * pivots_stride]; - lapackLuSolve(trans, n, nrhs, lu_working_ptr, leading_dimension, pivots_working_ptr, + lapackLuSolve(trans, n, nrhs, const_cast(lu_working_ptr), leading_dimension, const_cast(pivots_working_ptr), b_working_ptr, leading_dimension, &info); // info from lapackLuSolve only reports if the i-th parameter is wrong diff --git a/aten/src/ATen/native/BinaryOps.cpp b/aten/src/ATen/native/BinaryOps.cpp index 8816f9622d85c..78f57470a922d 100644 --- a/aten/src/ATen/native/BinaryOps.cpp +++ b/aten/src/ATen/native/BinaryOps.cpp @@ -1416,7 +1416,7 @@ Tensor& comparison_op_(Tensor& self, const Scalar& other, OutImpl& out_impl) { } // We need explicit cast to OutFunc because each *_out func is overloaded twice. Without An explicit cast, merely -// referring to *_out function is ambiguious. +// referring to *_out function is ambiguous. using OutFunc = std::add_const::type; // less, alias for torch.lt diff --git a/aten/src/ATen/native/Blas.cpp b/aten/src/ATen/native/Blas.cpp index 665602fcf18a4..88069616bf8e7 100644 --- a/aten/src/ATen/native/Blas.cpp +++ b/aten/src/ATen/native/Blas.cpp @@ -81,7 +81,7 @@ TORCH_IMPL_FUNC(addmv_out_cpu)(const Tensor &self, const Tensor &mat, const Tens if (result.numel() != 0) { NoNamesGuard guard; - if (use_mkldnn_lower_precision_matmul(mat, vec, /*result=*/Tensor())){ + if (use_mkldnn_matmul(mat, vec, /*result=*/Tensor())){ mkldnn_matmul(mat, vec, result, beta_.to(), alpha_.to()); return; } @@ -176,7 +176,7 @@ Tensor dot(const Tensor &self, const Tensor &other){ return at::_efficientzerotensor({}, self.options()); } - if (use_mkldnn_lower_precision_matmul(self, other, /*result=*/Tensor())){ + if (use_mkldnn_matmul(self, other, /*result=*/Tensor())){ // mkldnn matmul expect result have sizes info to create ideep tensor auto r = at::empty({1, 1}, self.options()); mkldnn_matmul(self, other, r, /*beta=*/0); @@ -185,7 +185,7 @@ Tensor dot(const Tensor &self, const Tensor &other){ return AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, self.scalar_type(), "dot", [&] { Tensor result = at::empty({}, self.options()); - result.fill_(dot_impl(self.numel(), self.data_ptr(), self.stride(0), other.data_ptr(), other.stride(0))); + result.fill_(dot_impl(self.numel(), const_cast(self.const_data_ptr()), self.stride(0), const_cast(other.const_data_ptr()), other.stride(0))); return result; }); } @@ -216,7 +216,7 @@ Tensor vdot(const Tensor &self, const Tensor &other){ return AT_DISPATCH_COMPLEX_TYPES(self.scalar_type(), "vdot", [&] { Tensor result = at::empty({}, self.options()); - result.fill_(vdot_impl(self.numel(), self.data_ptr(), 
self.stride(0), other.data_ptr(), other.stride(0))); + result.fill_(vdot_impl(self.numel(), const_cast(self.const_data_ptr()), self.stride(0), const_cast(other.const_data_ptr()), other.stride(0))); return result; }); diff --git a/aten/src/ATen/native/BlasKernel.cpp b/aten/src/ATen/native/BlasKernel.cpp index f2c61646b1b5c..48a077814880b 100644 --- a/aten/src/ATen/native/BlasKernel.cpp +++ b/aten/src/ATen/native/BlasKernel.cpp @@ -1,6 +1,8 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include #include #include +#include #include #include #include @@ -9,6 +11,10 @@ #include #include +#if defined(__aarch64__) && !defined(C10_MOBILE) +#include +#endif + namespace { /// Wrapper for const_cast with type-inference. @@ -74,24 +80,53 @@ extern "C" void sgemv_(char *trans, int *m, int *n, float *alpha, float *a, int namespace at::native { namespace blas_impl { +#if defined(__aarch64__) && !defined(C10_MOBILE) +void fp16_gemv_notrans( + const int m, + const int n, + const float alpha, + const float16_t* a, + const int lda, + const float16_t* x, + const int incx, + const float beta, + float16_t* y, + const int incy); + +void fp16_gemv_trans( + const int m, + const int n, + const float alpha, + const float16_t* a, + const int lda, + const float16_t* x, + const int incx, + const float beta, + float16_t* y, + const int incy); +#endif template -bool scal_use_fast_path(int64_t n, int64_t incx) { +bool scal_use_fast_path(C10_UNUSED int64_t n, C10_UNUSED int64_t incx) { return false; } template -bool gemv_use_fast_path(int64_t m, int64_t n, int64_t lda, int64_t incx, int64_t incy) { +bool gemv_use_fast_path(C10_UNUSED int64_t m, C10_UNUSED int64_t n, + C10_UNUSED int64_t lda, C10_UNUSED int64_t incx, C10_UNUSED int64_t incy) { return false; } template -void scal_fast_path(int *n, scalar_t *a, scalar_t *x, int *incx) { +void scal_fast_path(C10_UNUSED int *n, C10_UNUSED scalar_t *a, C10_UNUSED scalar_t *x, C10_UNUSED int *incx) { TORCH_INTERNAL_ASSERT(false, "scal_fast_path shouldn't be called for this configuration"); } template -void gemv_fast_path(const char *trans, const int *m, const int *n, const scalar_t *alpha, const scalar_t *a, const int *lda, const scalar_t *x, const int *incx, const scalar_t *beta, scalar_t *y, const int *incy) { +void gemv_fast_path(C10_UNUSED const char *trans, C10_UNUSED const int *m, C10_UNUSED const int *n, + C10_UNUSED const scalar_t *alpha, C10_UNUSED const scalar_t *a, C10_UNUSED const int *lda, + C10_UNUSED const scalar_t *x, C10_UNUSED const int *incx, C10_UNUSED const scalar_t *beta, + C10_UNUSED scalar_t *y, C10_UNUSED const int *incy) { TORCH_INTERNAL_ASSERT(false, "gemv_fast_path shouldn't be called for this configuration"); } @@ -155,7 +190,248 @@ INSTANTIATE(int16_t); INSTANTIATE(int); INSTANTIATE(int64_t); INSTANTIATE(c10::BFloat16); +#if defined(__aarch64__) && !defined(C10_MOBILE) +template <> +bool scal_use_fast_path(C10_UNUSED int64_t n, C10_UNUSED int64_t incx) { + return false; +} + +template <> +bool gemv_use_fast_path( + C10_UNUSED int64_t m, + C10_UNUSED int64_t n, + C10_UNUSED int64_t lda, + C10_UNUSED int64_t incx, + C10_UNUSED int64_t incy) { + return true; +} + +#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC +static inline float16_t reduce(float16x4_t x) { + auto sum = vpadd_f16(x, x); + return vget_lane_f16(vpadd_f16(sum, sum), 0); +} +static inline float16_t reduce(float16x8_t x) { + return reduce(vadd_f16(vget_low_f16(x), vget_high_f16(x))); +} + + +static void fp16_gemv_trans_fp16_arith(const int m, const int n, const float16_t* a, const int lda, 
const float16_t *x, float16_t* y, int incy) { + parallel_for(0, n / 4, 1, [&](int begin, int end) { + for (auto i = begin * 4 ; i < end * 4; i += 4) { + float16x8_t sum0Vec = vdupq_n_f16(0); + float16x8_t sum1Vec = vdupq_n_f16(0); + float16x8_t sum2Vec = vdupq_n_f16(0); + float16x8_t sum3Vec = vdupq_n_f16(0); + const auto row0 = a + lda * (i + 0); + const auto row1 = a + lda * (i + 1); + const auto row2 = a + lda * (i + 2); + const auto row3 = a + lda * (i + 3); + for (auto j = 0; j < m; j += 8) { + float16x8_t xVec = vld1q_f16(x + j); + float16x8_t a0Vec = vld1q_f16(row0 + j); + sum0Vec = vaddq_f16(sum0Vec, vmulq_f16(a0Vec, xVec)); + float16x8_t a1Vec = vld1q_f16(row1 + j); + sum1Vec = vaddq_f16(sum1Vec, vmulq_f16(a1Vec, xVec)); + float16x8_t a2Vec = vld1q_f16(row2 + j); + sum2Vec = vaddq_f16(sum2Vec, vmulq_f16(a2Vec, xVec)); + float16x8_t a3Vec = vld1q_f16(row3 + j); + sum3Vec = vaddq_f16(sum3Vec, vmulq_f16(a3Vec, xVec)); + } + y[(i + 0) * incy] = reduce(sum0Vec); + y[(i + 1) * incy] = reduce(sum1Vec); + y[(i + 2) * incy] = reduce(sum2Vec); + y[(i + 3) * incy] = reduce(sum3Vec); + } + }); +} +#endif + +static inline float reduce(float32x4_t x) { + auto sum = vpaddq_f32(x, x); + return vgetq_lane_f32(vpaddq_f32(sum, sum), 0); +} + +static void fp16_gemv_trans_fp32_arith(const int m, const int n, const float16_t* a, const int lda, const float16_t *x, float16_t* y, int incy) { + parallel_for(0, n / 4, 1, [&](int begin, int end) { + for (auto i = begin * 4 ; i < end * 4; i += 4) { + float32x4_t sum0Vec = vdupq_n_f32(0); + float32x4_t sum1Vec = vdupq_n_f32(0); + float32x4_t sum2Vec = vdupq_n_f32(0); + float32x4_t sum3Vec = vdupq_n_f32(0); + const auto row0 = a + lda * (i + 0); + const auto row1 = a + lda * (i + 1); + const auto row2 = a + lda * (i + 2); + const auto row3 = a + lda * (i + 3); + for (auto j = 0; j < m; j += 4) { + float32x4_t xVec = vcvt_f32_f16(vld1_f16(x + j)); + float32x4_t a0Vec = vcvt_f32_f16(vld1_f16(row0 + j)); + sum0Vec = vaddq_f32(sum0Vec, vmulq_f32(a0Vec, xVec)); + float32x4_t a1Vec = vcvt_f32_f16(vld1_f16(row1 + j)); + sum1Vec = vaddq_f32(sum1Vec, vmulq_f32(a1Vec, xVec)); + float32x4_t a2Vec = vcvt_f32_f16(vld1_f16(row2 + j)); + sum2Vec = vaddq_f32(sum2Vec, vmulq_f32(a2Vec, xVec)); + float32x4_t a3Vec = vcvt_f32_f16(vld1_f16(row3 + j)); + sum3Vec = vaddq_f32(sum3Vec, vmulq_f32(a3Vec, xVec)); + } + y[(i + 0) * incy] = reduce(sum0Vec); + y[(i + 1) * incy] = reduce(sum1Vec); + y[(i + 2) * incy] = reduce(sum2Vec); + y[(i + 3) * incy] = reduce(sum3Vec); + } + }); +} + +void fp16_gemv_trans( + const int m, + const int n, + const float alpha, + const float16_t* a, + const int lda, + const float16_t* x, + const int incx, + const float beta, + float16_t* y, + const int incy) { + if (incx == 1 && alpha == 1.0 && beta == 0.0 && m % 4 == 0 && n % 4 == 0) { +#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC + return at::globalContext().allowFP16ReductionCPU() && m % 8 == 0 ? 
fp16_gemv_trans_fp16_arith(m, n, a, lda, x, y, incy) + : fp16_gemv_trans_fp32_arith(m, n, a, lda, x, y, incy); +#else + return fp16_gemv_trans_fp32_arith(m, n, a, lda, x, y, incy); +#endif + } + for (const auto i : c10::irange(n)) { + float sum = 0; + const auto row_ = a + lda * i; + for (const auto j : c10::irange(m)) { + sum += x[j * incx] * row_[j]; + } + if (beta == 0.0) { + y[i * incy] = alpha * sum; + } else { + y[i * incy] = beta * y[i * incy] + alpha * sum; + } + } +} + + +#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC +static void fp16_gemv_notrans_fp16_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) { + for (auto j = 0; j < n; j++) { + auto vecCol = vdup_n_f16(x[j]); + const auto* column = a + lda * j; + for (auto i = 0; i < m; i += 4) { + auto yf16 = y + i; + auto matRow = vld1_f16(column + i); + auto resVec = j != 0 ? vld1_f16(yf16) : vdup_n_f16(0); + resVec = vfma_lane_f16(resVec, matRow, vecCol, 0); + vst1_f16(yf16, resVec); + } + } +} +#endif + +static void fp16_gemv_notrans_fp32_arith(int m, int n, const float16_t* a, const int lda, const float16_t *x, float16_t *y) { + std::vector sum(m); + for (auto j = 0; j < n; j++) { + auto vecCol = vdup_n_f32(x[j]); + const auto* column = a + lda * j; + for (auto i = 0; i < m; i += 4) { + auto sf32 = sum.data() + i; + auto matRow = vcvt_f32_f16(vld1_f16(column + i)); + auto resVec = j != 0 ? vld1q_f32(sf32) : vdupq_n_f32(0); + resVec = vfmaq_lane_f32(resVec, matRow, vecCol, 0); + vst1q_f32(sf32, resVec); + } + } + + for (auto i = 0; i < m; i+= 4) { + vst1_f16(y + i, vcvt_f16_f32(vld1q_f32(sum.data() + i))); + } +} + +void fp16_gemv_notrans( + const int m, + const int n, + const float alpha, + const float16_t* a, + const int lda, + const float16_t* x, + const int incx, + const float beta, + float16_t* y, + const int incy) { + if (incx == 1 && alpha == 1.0 && beta == 0.0 && m % 4 == 0 && incy == 1) { +#ifdef __ARM_FEATURE_FP16_SCALAR_ARITHMETIC + return at::globalContext().allowFP16ReductionCPU() ? 
fp16_gemv_notrans_fp16_arith(m, n, a, lda, x, y) + : fp16_gemv_notrans_fp32_arith(m, n, a, lda, x, y); +#else + return fp16_gemv_notrans_fp32_arith(m, n, a, lda, x, y); +#endif + } + std::vector sum(m); + for (const auto j : c10::irange(n)) { + const auto* column_ = a + lda * j; + auto z = alpha * x[j * incx]; + for (const auto i : c10::irange(m)) { + sum[i] += z * column_[i]; + } + } + if (beta == 0.0) { + for (const auto i : c10::irange(m)) { + y[i * incy] = sum[i]; + } + } else { + for (const auto i : c10::irange(m)) { + y[i * incy] += sum[i]; + } + } +} + +template <> +void gemv_fast_path( + const char* trans, + const int* m, + const int* n, + const at::Half* alpha, + const at::Half* a, + const int* lda, + const at::Half* x, + const int* incx, + const at::Half* beta, + at::Half* y, + const int* incy) { + using namespace c10::detail; + if ((trans[0] == 'T') || (trans[0] == 't')) { + fp16_gemv_trans( + *m, + *n, + fp16_from_bits(alpha->x), + reinterpret_cast(a), + *lda, + reinterpret_cast(x), + *incx, + fp16_from_bits(beta->x), + reinterpret_cast(y), + *incy); + } else { + fp16_gemv_notrans( + *m, + *n, + fp16_from_bits(alpha->x), + reinterpret_cast(a), + *lda, + reinterpret_cast(x), + *incx, + fp16_from_bits(beta->x), + reinterpret_cast(y), + *incy); + } +} +#else INSTANTIATE(c10::Half); +#endif #undef INSTANTIATE } // namespace blas_impl diff --git a/aten/src/ATen/native/Bucketization.cpp b/aten/src/ATen/native/Bucketization.cpp index 688512f2711d0..736273a40cb09 100644 --- a/aten/src/ATen/native/Bucketization.cpp +++ b/aten/src/ATen/native/Bucketization.cpp @@ -16,7 +16,7 @@ /* Implement a numpy like searchsorted and a TF like bucketize function running on cpu * - * - torch.searchsorted(sorted_sequence, values, right=False, side='left', out_int32=False, sorter=None) + * - torch.searchsorted(sorted_sequence, values, right=False, side=None, out_int32=False, sorter=None) * sorted_sequence - N*D or 1D (apply to all values) tensor containing sorted sequences in last dimension * values - N*D tensor or a Scalar (when sorted_sequence is 1D) containing the search values * right - corresponding to lower bound if False and upper bound if True @@ -92,9 +92,9 @@ void searchsorted_cpu_contiguous(Tensor& result, const Tensor& input, const Tens int64_t idim_in = is_scalar_input ? 1 : input.sizes().back(); int64_t idim_bd = boundaries.sizes().back(); - const input_t *data_in = input.data_ptr(); - const input_t *data_bd = boundaries.data_ptr(); - const int64_t *data_st = sorter.defined() ? sorter.data_ptr() : nullptr; + const input_t *data_in = input.const_data_ptr(); + const input_t *data_bd = boundaries.const_data_ptr(); + const int64_t *data_st = sorter.defined() ? 
sorter.const_data_ptr() : nullptr; output_t *data_out = result.data_ptr(); bool is_1d_boundaries = boundaries.dim() == 1; @@ -162,7 +162,7 @@ Tensor& searchsorted_out_cpu( return result; } - // for non-contiguous result tensors, we write the output to a contiguous copy so we can later copy back, maintaing the original result tensor + // for non-contiguous result tensors, we write the output to a contiguous copy so we can later copy back, maintaining the original result tensor Tensor out = result; if (!result.is_contiguous()) { out = result.contiguous(); diff --git a/aten/src/ATen/native/CPUBlas.cpp b/aten/src/ATen/native/CPUBlas.cpp index f0e5f333df80a..ac49364573c48 100644 --- a/aten/src/ATen/native/CPUBlas.cpp +++ b/aten/src/ATen/native/CPUBlas.cpp @@ -164,6 +164,11 @@ void gemm( const float beta, float *c, int64_t ldc) { internal::normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, &ldc); +#if AT_MKLDNN_ENABLED() + if (mkldnn_bf32_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc)) { + return; + } +#endif #if AT_BUILD_WITH_BLAS() if (use_blas_gemm(transa, transb, m, n, k, lda, ldb, ldc)) { int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; @@ -394,6 +399,42 @@ void gemm( } } +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + const float alpha, + const at::Half *a, int64_t lda, + const at::Half *b, int64_t ldb, + const float beta, + float *c, int64_t ldc) { + internal::normalize_last_dims(transa, transb, m, n, k, &lda, &ldb, &ldc); +#ifdef MKL_HAS_SHGEMM + if (use_blas_gemm(transa, transb, m, n, k, lda, ldb, ldc)) { + int m_ = m, n_ = n, k_ = k, lda_ = lda, ldb_ = ldb, ldc_ = ldc; + mkl_gemm_f16f16f32(transa, transb, m_, n_, k_, alpha, a, lda_, b, ldb_, beta, c, ldc_); + return; + } +#endif + // for the fallback path, first compute gemm with beta = 0, + // and then add c in full precision. 
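The new at::Half x at::Half -> float gemm overload above prefers MKL's shgemm when it is available; otherwise it runs the half-precision gemm with beta = 0 into a scratch buffer and folds the existing C back in at float precision, special-casing beta == 0 so a NaN already sitting in the uninitialized C cannot leak into the result. A standalone sketch of that accumulation step follows (the function name and the column-major layout assumption are mine, not the ATen code):

#include <cstdint>
#include <vector>

// Fold a freshly computed m x n product (column-major in `tmp`, leading dimension m)
// into C with the usual C = beta * C + tmp rule, skipping the read of C when beta == 0.
void scale_and_accumulate(const std::vector<float>& tmp,
                          float beta, float* c, int64_t ldc, int64_t m, int64_t n) {
  for (int64_t j = 0; j < n; ++j) {
    for (int64_t i = 0; i < m; ++i) {
      const float v = tmp[j * m + i];
      c[j * ldc + i] = (beta == 0.f) ? v : beta * c[j * ldc + i] + v;
    }
  }
}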
+ int64_t c_size = n * m; + std::vector float16_c(c_size, 0.f); + gemm_stub( + at::kCPU, at::kHalf, + transa, transb, m, n, k, alpha, a, lda, b, ldb, 0.f, float16_c.data(), m); + for (const auto j : c10::irange(n)) { + for (const auto i : c10::irange(m)) { + auto offset = j * ldc + i; + // beta == 0 won't propagate NaN from C + if (beta == 0.f) { + c[offset] = c10::convert(float16_c[j * m + i]); + } else { + c[offset] = beta * c[offset] + c10::convert(float16_c[j * m + i]); + } + } + } +} + void gemm( TransposeType transa, TransposeType transb, int64_t m, int64_t n, int64_t k, diff --git a/aten/src/ATen/native/CPUBlas.h b/aten/src/ATen/native/CPUBlas.h index 8c9075a06780e..3b30df1c21fad 100644 --- a/aten/src/ATen/native/CPUBlas.h +++ b/aten/src/ATen/native/CPUBlas.h @@ -88,6 +88,15 @@ void gemm( float beta, at::Half *c, int64_t ldc); +void gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + const float alpha, + const at::Half *a, int64_t lda, + const at::Half *b, int64_t ldb, + const float beta, + float *c, int64_t ldc); + void gemm( TransposeType transa, TransposeType transb, int64_t m, int64_t n, int64_t k, diff --git a/aten/src/ATen/native/CPUFallback.cpp b/aten/src/ATen/native/CPUFallback.cpp index b8d9d3b9347d9..502c61e4d144c 100644 --- a/aten/src/ATen/native/CPUFallback.cpp +++ b/aten/src/ATen/native/CPUFallback.cpp @@ -89,6 +89,7 @@ void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool std::vector> tensorlist_args; std::vector tensorlist_args_indices; + c10::optional tgt_device = c10::nullopt; // save converted cpu tensor for TensorList std::vector tensorlist_cpu_args; @@ -108,6 +109,25 @@ void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool auto cpu_ivalue = c10::IValue(c10::List(to_cpu(ivalue.toTensorList().vec()))); tensorlist_cpu_args.push_back(cpu_ivalue); (*stack)[arguments_begin + idx] = std::move(cpu_ivalue); + tensorlist_args.push_back(ivalue.toTensorList()); + } else if (ivalue.isOptionalTensorList()) { + auto opt_tensors = ivalue.toOptionalTensorList().vec(); + std::vector need_convert_tensors; + std::vector need_convert_tensors_index; + for (auto i : c10::irange(opt_tensors.size())) { + if (!opt_tensors[i].has_value() || !opt_tensors[i]->defined()) continue; + need_convert_tensors.push_back(opt_tensors[i].value()); + need_convert_tensors_index.push_back(i); + } + auto cpu_tensors = to_cpu(need_convert_tensors); + for (const auto i : c10::irange(need_convert_tensors_index.size())) { + auto idx = need_convert_tensors_index[i]; + opt_tensors[idx] = cpu_tensors[i]; + } + (*stack)[arguments_begin + idx] = c10::IValue(opt_tensors); + } else if (ivalue.isDevice()) { + tgt_device = ivalue.toDevice(); + (*stack)[arguments_begin + idx] = c10::IValue(c10::Device(kCPU)); } } // XLA requires all of the tensor arguments to be gathered up and converted to CPU together. @@ -151,7 +171,7 @@ void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool // the temporary CPU output tensor that we created. // // Note [CPU Fallback Does Not Handle View Operators] - // Also note that we are incapable of handling immutable alises properly. + // Also note that we are incapable of handling immutable aliases properly. // Why? // Schemas with an immutable alias'd tensor outputs correspond to view operators. 
// For example, the `view_as` schema from native_functions.yaml: @@ -168,8 +188,9 @@ void cpu_fallback(const c10::OperatorHandle& op, torch::jit::Stack* stack, bool auto returns = torch::jit::last(stack, num_returns); const auto returns_begin = stack->size() - num_returns; - c10::optional tgt_device = - compute_target_device(tensor_args, tensorlist_args); + if (tgt_device == c10::nullopt) { + tgt_device = compute_target_device(tensor_args, tensorlist_args); + } for (const auto idx : c10::irange(returns.size())) { const AliasInfo* alias_info = schema_returns[idx].alias_info(); diff --git a/aten/src/ATen/native/ChanelShuffle.cpp b/aten/src/ATen/native/ChanelShuffle.cpp index 2d9ca7cb459d7..be57917967fa9 100644 --- a/aten/src/ATen/native/ChanelShuffle.cpp +++ b/aten/src/ATen/native/ChanelShuffle.cpp @@ -20,11 +20,16 @@ namespace at::native { Tensor channel_shuffle_cpu(const Tensor& self, int64_t groups) { - auto memory_format = self.suggest_memory_format(); - auto output = at::empty({0}, self.options()); - output.resize_(self.sizes(), memory_format); - auto input = self.contiguous(memory_format); - channel_shuffle_kernel(kCPU, output, input, groups); + Tensor output; + if (self.numel() == 0) { + output = self.alias(); + } else { + auto memory_format = self.suggest_memory_format(); + output = at::empty({0}, self.options()); + output.resize_(self.sizes(), memory_format); + auto input = self.contiguous(memory_format); + channel_shuffle_kernel(kCPU, output, input, groups); + } return namedinference::propagate_names_if_nonempty( output, self.has_names() ? self.names() : at::ArrayRef{}); @@ -69,7 +74,7 @@ Tensor math_channel_shuffle(const Tensor& self, int64_t groups) { // It is not clear, however from initial looking around it feels that // this may not be correct. // In this case channels last will likely require custom implementation - // if we want to preseve the memory order. + // if we want to preserve the memory order. // XNNPACK has channel shuffle op for NHWC. For mobile usecase this is good. // For server we will have to do a custom implementation. // For ChannelsFirst, a.k.a Contiguous, memory format we will also need diff --git a/aten/src/ATen/native/CompositeRandomAccessorCommon.h b/aten/src/ATen/native/CompositeRandomAccessorCommon.h index 919647992ccab..9111c3515afce 100644 --- a/aten/src/ATen/native/CompositeRandomAccessorCommon.h +++ b/aten/src/ATen/native/CompositeRandomAccessorCommon.h @@ -118,7 +118,7 @@ class CompositeRandomAccessor { using value_type = composite_value_type; using reference = references_holder; // Note that CompositeRandomAccessor does not hold key and values - // in a specific datastrcture, which means that a pointer to a (key, value) + // in a specific datastructure, which means that a pointer to a (key, value) // is not defined. Hence we just use a pointer type of the KeyAccessor. using pointer = typename std::iterator_traits::pointer; using difference_type = typename std::iterator_traits::difference_type; diff --git a/aten/src/ATen/native/Constraints.cpp b/aten/src/ATen/native/Constraints.cpp index 9b7703313528d..8f3f8c11e696c 100644 --- a/aten/src/ATen/native/Constraints.cpp +++ b/aten/src/ATen/native/Constraints.cpp @@ -29,7 +29,7 @@ void sym_constrain_range( int64_t min_val = min.has_value() ? min.value() : std::numeric_limits::min(); int64_t max_val = max.has_value() ? 
max.value() : std::numeric_limits::max(); - int64_t size_as_int = size.toInt(); + int64_t size_as_int = size.toLong(); TORCH_CHECK( max_val >= min_val, diff --git a/aten/src/ATen/native/ConvUtils.h b/aten/src/ATen/native/ConvUtils.h index 5d2691b9761ee..4b814f3e442cb 100644 --- a/aten/src/ATen/native/ConvUtils.h +++ b/aten/src/ATen/native/ConvUtils.h @@ -358,7 +358,6 @@ static inline bool miopen_conv_use_channels_last(const at::Tensor& input, const } bool can_use_miopen_channels_last_2d = false; -#if defined(USE_ROCM) && (ROCM_VERSION >= 40300) // TODO: Remove PYTORCH_MIOPEN_SUGGEST_NHWC once ROCm officially supports NHWC in MIOpen // See #64427 static c10::optional PYTORCH_MIOPEN_SUGGEST_NHWC = c10::utils::check_env("PYTORCH_MIOPEN_SUGGEST_NHWC"); @@ -370,7 +369,6 @@ static inline bool miopen_conv_use_channels_last(const at::Tensor& input, const ( (input_memory_format == at::MemoryFormat::ChannelsLast) || (weight_memory_format == at::MemoryFormat::ChannelsLast) ) ); -#endif bool can_use_miopen_channels_last_3d = false; diff --git a/aten/src/ATen/native/Convolution.cpp b/aten/src/ATen/native/Convolution.cpp index e76128db847a0..717280a6cdcab 100644 --- a/aten/src/ATen/native/Convolution.cpp +++ b/aten/src/ATen/native/Convolution.cpp @@ -848,9 +848,8 @@ at::Tensor complex_convolution( SymIntArrayRef output_padding, c10::SymInt groups) { check_input_same_type_as_parameters(input, weight, bias); - Tensor i_r, i_i, w_r, w_i; - std::tie(i_r, i_i) = complex_to_real(input.resolve_conj()); - std::tie(w_r, w_i) = complex_to_real(weight.resolve_conj()); + auto [i_r, i_i] = complex_to_real(input.resolve_conj()); + auto [w_r, w_i] = complex_to_real(weight.resolve_conj()); // [NOTE] Complex Convolution // conv(W, x, b) = conv(Wr, xr, br) - conv(Wi, xi, 0) + i(conv(Wi, xr, bi) + conv(Wr, xi, 0)) @@ -866,8 +865,7 @@ at::Tensor complex_convolution( b = at::convolution_symint(i_i, w_i, bias, stride, padding, dilation, transposed, output_padding, groups); c = at::convolution_symint(i_r + i_i, w_r + w_i, bias, stride, padding, dilation, transposed, output_padding, groups); } else { - Tensor b_r, b_i; - std::tie(b_r, b_i) = complex_to_real(bias.resolve_conj()); + auto [b_r, b_i] = complex_to_real(bias.resolve_conj()); a = at::convolution_symint(i_r, w_r, b_r, stride, padding, dilation, transposed, output_padding, groups); b = at::convolution_symint(i_i, w_i, Tensor(), stride, padding, dilation, transposed, output_padding, groups); c = at::convolution_symint(i_r + i_i, w_r + w_i, b_r + b_i, stride, padding, dilation, transposed, output_padding, groups); @@ -887,9 +885,8 @@ at::Tensor complex_convolution_mode( c10::SymInt groups) { auto bias = bias_opt.value_or(Tensor()); check_input_same_type_as_parameters(input, weight, bias); - Tensor i_r, i_i, w_r, w_i; - std::tie(i_r, i_i) = complex_to_real(input.resolve_conj()); - std::tie(w_r, w_i) = complex_to_real(weight.resolve_conj()); + auto [i_r, i_i] = complex_to_real(input.resolve_conj()); + auto [w_r, w_i] = complex_to_real(weight.resolve_conj()); // See [NOTE] Complex Convolution Tensor a, b, c; @@ -898,8 +895,7 @@ at::Tensor complex_convolution_mode( b = at::_convolution_mode_symint(i_i, w_i, bias, stride, padding, dilation, groups); c = at::_convolution_mode_symint(i_r + i_i, w_r + w_i, bias, stride, padding, dilation, groups); } else { - Tensor b_r, b_i; - std::tie(b_r, b_i) = complex_to_real(bias.resolve_conj()); + auto [b_r, b_i] = complex_to_real(bias.resolve_conj()); a = at::_convolution_mode_symint(i_r, w_r, b_r, stride, padding, dilation, groups); b = 
at::_convolution_mode_symint(i_i, w_i, Tensor(), stride, padding, dilation, groups); c = at::_convolution_mode_symint(i_r + i_i, w_r + w_i, b_r + b_i, stride, padding, dilation, groups); @@ -926,9 +922,7 @@ at::Tensor conv1d_symint( bias.dtype().name(), ") should be the same"); - Tensor input; - bool is_batched; - std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 1, "conv1d"); + auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 1, "conv1d"); Tensor output; if (at::isComplexType(input_.scalar_type())) { output = complex_convolution(input, weight, bias, stride, padding, dilation, false, {0}, groups); @@ -953,9 +947,7 @@ at::Tensor conv2d_symint( bias.dtype().name(), ") should be the same"); - Tensor input; - bool is_batched; - std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 2, "conv2d"); + auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 2, "conv2d"); Tensor output; if (at::isComplexType(input_.scalar_type())) { output = complex_convolution(input, weight, bias, stride, padding, dilation, false, {{0, 0}}, groups); @@ -980,9 +972,7 @@ at::Tensor conv3d_symint( bias.dtype().name(), ") should be the same"); - Tensor input; - bool is_batched; - std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 3, "conv3d"); + auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 3, "conv3d"); Tensor output; if (at::isComplexType(input_.scalar_type())) { output = complex_convolution(input, weight, bias, stride, padding, dilation, false, {{0, 0, 0}}, groups); @@ -1080,9 +1070,7 @@ at::Tensor conv1d_padding_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups) { - Tensor input; - bool is_batched; - std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 1, "conv1d"); + auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 1, "conv1d"); Tensor output; if (at::isComplexType(input_.scalar_type())) { output = complex_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); @@ -1096,9 +1084,7 @@ at::Tensor conv2d_padding_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups) { - Tensor input; - bool is_batched; - std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 2, "conv2d"); + auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 2, "conv2d"); Tensor output; if (at::isComplexType(input_.scalar_type())) { output = complex_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); @@ -1112,9 +1098,7 @@ at::Tensor conv3d_padding_symint( const Tensor& input_, const Tensor& weight, const c10::optional& bias, c10::SymIntArrayRef stride, c10::string_view padding, c10::SymIntArrayRef dilation, c10::SymInt groups) { - Tensor input; - bool is_batched; - std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 3, "conv3d"); + auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 3, "conv3d"); Tensor output; if (at::isComplexType(input_.scalar_type())) { output = complex_convolution_mode(input, weight, bias, stride, std::move(padding), dilation, groups); @@ -1131,9 +1115,7 @@ at::Tensor conv_transpose1d_symint( c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; - Tensor input; - 
bool is_batched; - std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 1, "conv_transpose1d"); + auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 1, "conv_transpose1d"); Tensor output; if (at::isComplexType(input_.scalar_type())) { output = complex_convolution( @@ -1152,9 +1134,7 @@ at::Tensor conv_transpose2d_symint( c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; - Tensor input; - bool is_batched; - std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 2, "conv_transpose2d"); + auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 2, "conv_transpose2d"); Tensor output; if (at::isComplexType(input_.scalar_type())) { output = complex_convolution( @@ -1173,9 +1153,7 @@ at::Tensor conv_transpose3d_symint( c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; - Tensor input; - bool is_batched; - std::tie(input, is_batched) = batchify(input_, /*num_spatial_dims=*/ 3, "conv_transpose3d"); + auto [input, is_batched] = batchify(input_, /*num_spatial_dims=*/ 3, "conv_transpose3d"); Tensor output; if (at::isComplexType(input_.scalar_type())) { output = complex_convolution( @@ -1420,8 +1398,8 @@ static inline std::vector calc_output_size( conv_output_size(input.sizes(), weight.sizes(), params.padding, params.stride, params.dilation); // Handle empty # of channels. - if (input.size(1) == 0) { - output_size[input_channels_dim] = 0; + if (input.size(input_channels_dim) == 0) { + output_size[output_channels_dim] = 0; } return output_size; } diff --git a/aten/src/ATen/native/ConvolutionMM2d.cpp b/aten/src/ATen/native/ConvolutionMM2d.cpp index 7b0a7300459f6..6f8a3477c239c 100644 --- a/aten/src/ATen/native/ConvolutionMM2d.cpp +++ b/aten/src/ATen/native/ConvolutionMM2d.cpp @@ -61,7 +61,7 @@ static Tensor compute_columns2d( kernel_height * kernel_width * n_input_plane : output_height * output_width; columns = at::empty({batch_size, row, col}, input.options()); AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "slow_conv2d_cpu", [&]{ - auto input_a = input.accessor(); + auto input_a = input.accessor(); auto columns_a = columns.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { @@ -220,9 +220,9 @@ static inline Tensor view_weight_2d(const Tensor& weight_, template static void slow_conv2d_update_output_frame( - TensorAccessor input, + TensorAccessor input, TensorAccessor output, - TensorAccessor weight, + TensorAccessor weight, bool has_bias, TensorAccessor finput, int64_t kernel_height, @@ -285,8 +285,8 @@ static void slow_conv2d_update_output_frame( template void slow_conv2d_backward_update_grad_input_frame( TensorAccessor grad_input, - TensorAccessor grad_output, - TensorAccessor weight, + TensorAccessor grad_output, + TensorAccessor weight, scalar_t *fgrad_input, int64_t kernel_height, int64_t kernel_width, @@ -405,9 +405,9 @@ void slow_conv2d_backward_out_cpu_template( AT_DISPATCH_FLOATING_TYPES_AND2( kBFloat16, kHalf, input.scalar_type(), "slow_conv2d_cpu_grad_input", [&] { - auto grad_output_a = grad_output.accessor(); + auto grad_output_a = grad_output.accessor(); auto grad_input_a = grad_input.accessor(); - auto weight_a = weight.accessor(); + auto weight_a = weight.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { auto fgrad_input = std::make_unique(fgrad_input_size); @@ -434,8 +434,8 @@ void slow_conv2d_backward_out_cpu_template( 
template void slow_conv2d_backward_weight_frame( TensorAccessor grad_weight, - TensorAccessor grad_output, - TensorAccessor finput, + TensorAccessor grad_output, + TensorAccessor finput, bool is_channels_last) { // Compute grad_weight += grad_output.reshape({grad_output.shape(0), -1}) * finput.T // Note gemm expects fortran order, so all 3 matrices are transposed. @@ -519,9 +519,9 @@ static void slow_conv2d_backward_weight_out_cpu_template( AT_DISPATCH_FLOATING_TYPES_AND2( kBFloat16, kHalf, input.scalar_type(), "slow_conv2d_cpu_grad_weight", [&] { - auto grad_output_a = grad_output.accessor(); + auto grad_output_a = grad_output.accessor(); auto grad_weight_2d_a = grad_weight_2d.accessor(); - auto finput_a = finput.accessor(); + auto finput_a = finput.accessor(); for (const auto t : c10::irange(batch_size)) { auto grad_output_t = grad_output_a[t]; @@ -588,10 +588,10 @@ Tensor& slow_conv2d_forward_out_cpu( TORCH_CHECK(output.is_contiguous(memory_format), "slow_conv2d output tensor must be contiguous"); AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "slow_conv2d_cpu", [&]{ - auto input_a = input.accessor(); + auto input_a = input.accessor(); auto output_a = output.accessor(); auto finput_a = finput.accessor(); - auto weight_2d_a = weight_2d.accessor(); + auto weight_2d_a = weight_2d.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { for (const auto t : c10::irange(start, end)) { diff --git a/aten/src/ATen/native/ConvolutionMM3d.cpp b/aten/src/ATen/native/ConvolutionMM3d.cpp index c194721acd491..1d5e7a8333def 100644 --- a/aten/src/ATen/native/ConvolutionMM3d.cpp +++ b/aten/src/ATen/native/ConvolutionMM3d.cpp @@ -72,7 +72,7 @@ static Tensor compute_columns3d( input.options()); AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "compute_columns3d", [&] { - auto input_a = input.accessor(); + auto input_a = input.accessor(); auto columns_a = columns.accessor(); at::parallel_for(0, batch_size, CONV3D_GRAIN_SALT, [&](int64_t start, int64_t end) { @@ -261,11 +261,11 @@ static Tensor view_weight_2d(const Tensor& weight_) { template static void slow_conv3d_update_output_frame( - TensorAccessor input, + TensorAccessor input, TensorAccessor output, - TensorAccessor weight, + TensorAccessor weight, bool has_bias, - TensorAccessor finput, + TensorAccessor finput, int64_t kernel_depth, int64_t kernel_height, int64_t kernel_width, @@ -311,8 +311,8 @@ static void slow_conv3d_update_output_frame( template void slow_conv3d_backward_update_grad_input_frame( TensorAccessor grad_input, - TensorAccessor grad_output, - TensorAccessor weight, + TensorAccessor grad_output, + TensorAccessor weight, TensorAccessor fgrad_input, int64_t kernel_depth, int64_t kernel_height, @@ -430,12 +430,12 @@ void slow_conv3d_backward_out_cpu_template( AT_DISPATCH_FLOATING_TYPES_AND2( kBFloat16, kHalf, input.scalar_type(), "slow_conv3d_cpu_grad_input", [&] { + auto grad_input_a = grad_input.accessor(); + auto grad_output_a = grad_output_contiguous.accessor(); + auto fgrad_input_a = fgrad_input.accessor(); + auto weight_2d_a = weight2d.accessor(); at::parallel_for(0, batch_size, CONV3D_GRAIN_SALT, [&](int64_t start, int64_t end) { - auto grad_input_a = grad_input.accessor(); - auto grad_output_a = grad_output_contiguous.accessor(); - auto fgrad_input_a = fgrad_input.accessor(); - auto weight_2d_a = weight2d.accessor(); for (const auto t : c10::irange(start, end)) { auto grad_input_t = grad_input_a[t]; @@ -464,8 +464,8 @@ void slow_conv3d_backward_out_cpu_template( template 
void slow_conv3d_backward_weight_frame( TensorAccessor grad_weight, - TensorAccessor grad_output, - TensorAccessor finput, + TensorAccessor grad_output, + TensorAccessor finput, int64_t groups) { // Compute grad_weight += grad_output.reshape({grad_output.shape(0), -1}) * finput.T // Note gemm expects fortran order, so all 3 matrices are transposed. @@ -538,8 +538,8 @@ static void slow_conv3d_backward_parameters_out_cpu_template( AT_DISPATCH_FLOATING_TYPES_AND2( kBFloat16, kHalf, input.scalar_type(), "slow_conv3d_cpu_grad_weight", [&] { auto grad_weight_2d_a = grad_weight_2d.accessor(); - auto grad_output_a = grad_output_contiguous.accessor(); - auto finput_a = finput.accessor(); + auto grad_output_a = grad_output_contiguous.accessor(); + auto finput_a = finput.accessor(); for (const auto t : c10::irange(batch_size)) { auto grad_output_t = grad_output_a[t]; auto finput_t = finput_a[t]; @@ -623,10 +623,10 @@ Tensor& slow_conv3d_forward_out_cpu(const Tensor& self, TORCH_CHECK(output.is_contiguous(), "slow_conv3d output must be contiguous"); AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "slow_conv3d_cpu", [&] { - auto input_a = input.accessor(); + auto input_a = input.accessor(); auto output_a = output.accessor(); - auto finput_a = finput.accessor(); - auto weight_2d_a = weight_2d.accessor(); + auto finput_a = finput.accessor(); + auto weight_2d_a = weight_2d.accessor(); at::parallel_for( 0, batch_size, CONV3D_GRAIN_SALT, [&](int64_t start, int64_t end) { diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index bdd07ea01ff47..eaaa394036866 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -81,7 +81,7 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) { TORCH_INTERNAL_ASSERT_DEBUG_ONLY(self.sizes().equals(src.sizes())); _AT_DISPATCH_CP_TYPES(self.scalar_type(), "copy_", [&] { - scalar_t* sp = src.data_ptr(); + const scalar_t* sp = src.const_data_ptr(); scalar_t* rp = self.data_ptr(); scalar_t* bp = buf.data_ptr(); @@ -89,7 +89,7 @@ void copy_same_type_transpose_(Tensor& self, const Tensor& src) { int64_t NC = src.size(1); for (int64_t R = 0; R < NR; R += BLOCK_SZ) { for (int64_t C = 0; C < NC; C += BLOCK_SZ) { - scalar_t* spo = sp + R + C * NR; + const scalar_t* spo = sp + R + C * NR; scalar_t* rpo = rp + C + R * NC; int nr = std::min(NR - R, BLOCK_SZ); @@ -156,7 +156,7 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) auto* output_ptr = reinterpret_cast(self.data_ptr()); if (self.numel() < at::internal::GRAIN_SIZE) { - fbgemm::FloatToFloat16_simd(src.data_ptr(), output_ptr, self.numel()); + fbgemm::FloatToFloat16_simd(src.const_data_ptr(), output_ptr, self.numel()); } else { at::parallel_for( 0, @@ -164,14 +164,14 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) at::internal::GRAIN_SIZE, [&](int64_t begin, int64_t end) { fbgemm::FloatToFloat16_simd( - src.data_ptr() + begin, + src.const_data_ptr() + begin, output_ptr + begin, end - begin); }); } } else { - auto in_data = reinterpret_cast( - src.data_ptr()); + auto in_data = reinterpret_cast( + src.const_data_ptr()); auto* output_ptr = self.data_ptr(); if (self.numel() < at::internal::GRAIN_SIZE) { fbgemm::Float16ToFloat_simd(in_data, output_ptr, self.numel()); @@ -265,7 +265,7 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) auto iter = TensorIteratorConfig() .add_output(self) - .add_input(src) + .add_const_input(src) .resize_outputs(false) 
.check_all_same_dtype(false) .check_all_same_device(false) @@ -296,7 +296,7 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) } #endif - if(!self.is_complex() && src.is_complex()) { + if(!(self.is_complex() || self.dtype() == at::kBool) && src.is_complex()) { TORCH_WARN_ONCE("Casting complex values to real discards the imaginary part"); } copy_stub(device_type, iter, non_blocking); @@ -335,7 +335,7 @@ void copy_ignoring_overlaps(const TensorBase &dst, const TensorBase &src) { // FIXME: really, overlapping writes should be illegal/an error in Torch auto iter = TensorIteratorConfig() .add_output(dst) - .add_input(src) + .add_const_input(src) .resize_outputs(false) .set_check_mem_overlap(false) .check_all_same_dtype(true) diff --git a/aten/src/ATen/native/DispatchStub.cpp b/aten/src/ATen/native/DispatchStub.cpp index c7db94889c58d..93c004acdc17c 100644 --- a/aten/src/ATen/native/DispatchStub.cpp +++ b/aten/src/ATen/native/DispatchStub.cpp @@ -10,8 +10,19 @@ #include #include +#ifdef HAVE_ZVECTOR_CPU_DEFINITION +#include +#endif + namespace at::native { +#ifdef HAVE_ZVECTOR_CPU_DEFINITION +static inline bool cpu_has_vxe() +{ + return (getauxval(AT_HWCAP) & HWCAP_S390_VXE); +} +#endif + static CPUCapability compute_cpu_capability() { auto envar = std::getenv("ATEN_CPU_CAPABILITY"); if (envar) { @@ -60,10 +71,16 @@ static CPUCapability compute_cpu_capability() { #endif } #endif + +#ifdef HAVE_ZVECTOR_CPU_DEFINITION + // vxe is needed for fp32 vector instructions + if (cpu_has_vxe()) { + return CPUCapability::ZVECTOR; + } +#endif + #ifdef HAVE_VSX_CPU_DEFINITION return CPUCapability::VSX; -#elif HAVE_ZVECTOR_CPU_DEFINITION - return CPUCapability::ZVECTOR; #else return CPUCapability::DEFAULT; #endif diff --git a/aten/src/ATen/native/DispatchStub.h b/aten/src/ATen/native/DispatchStub.h index a7df275edf1de..1b3e29a963f18 100644 --- a/aten/src/ATen/native/DispatchStub.h +++ b/aten/src/ATen/native/DispatchStub.h @@ -227,18 +227,18 @@ struct RegisterPRIVATEUSE1Dispatch { // adding parentheses and using helper struct to get rid of the parentheses, do // not work with MSVC. So do a `using`-declaration if you need to pass in such // `fn`, e.g., grid_sampler_2d_backward_cpu_kernel in GridSampleKernel.h. 
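For readers following the DECLARE_DISPATCH rework in the next hunk, a minimal sketch of how these stub macros are typically wired together, using the fused_adam_stub pieces added later in this patch; the kernel registration line and kernel function name are illustrative assumptions, not lines from this diff:

// In a header (e.g. FusedAdam.h): declare the stub with its function-pointer type.
DECLARE_DISPATCH(fused_adam_fn, fused_adam_stub);

// In exactly one .cpp (e.g. FusedAdam.cpp): define the stub object.
DEFINE_DISPATCH(fused_adam_stub);

// In a per-ISA kernel translation unit: register an implementation (name assumed here).
REGISTER_DISPATCH(fused_adam_stub, &fused_adam_kernel);

// At call sites: dispatch through the stub with a device type, as FusedAdam.cpp does.
fused_adam_stub(kCPU, params[i], grads[i], exp_avgs[i], /* ... */);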
-#define DECLARE_DISPATCH(fn, name) \ - struct name : DispatchStub { \ - name() = default; \ - name(const name&) = delete; \ - name& operator=(const name&) = delete; \ - }; \ - extern TORCH_API struct name name +#define DECLARE_DISPATCH(fn, name) \ + struct name##_DECLARE_DISPATCH_type : DispatchStub { \ + name##_DECLARE_DISPATCH_type() = default; \ + name##_DECLARE_DISPATCH_type(const name##_DECLARE_DISPATCH_type&) = delete; \ + name##_DECLARE_DISPATCH_type& operator=(const name##_DECLARE_DISPATCH_type&) = delete; \ + }; \ + extern TORCH_API struct name##_DECLARE_DISPATCH_type name; -#define DEFINE_DISPATCH(name) struct name name +#define DEFINE_DISPATCH(name) struct name##_DECLARE_DISPATCH_type name #define REGISTER_ARCH_DISPATCH(name, arch, fn) \ - template <> name::FnPtr TORCH_API DispatchStub::arch = fn; + template <> name##_DECLARE_DISPATCH_type::FnPtr TORCH_API DispatchStub::arch = fn; #ifdef HAVE_AVX512_CPU_DEFINITION #define REGISTER_AVX512_DISPATCH(name, fn) REGISTER_ARCH_DISPATCH(name, AVX512, fn) @@ -277,16 +277,16 @@ struct RegisterPRIVATEUSE1Dispatch { REGISTER_ALL_CPU_DISPATCH(name, nullptr) #define REGISTER_CUDA_DISPATCH(name, fn) \ - static RegisterCUDADispatch name ## __register(name, fn); + static RegisterCUDADispatch name ## __register(name, fn); #define REGISTER_HIP_DISPATCH(name, fn) \ - static RegisterHIPDispatch name ## __register(name, fn); + static RegisterHIPDispatch name ## __register(name, fn); #define REGISTER_MPS_DISPATCH(name, fn) \ - static RegisterMPSDispatch name ## __register(name, fn); + static RegisterMPSDispatch name ## __register(name, fn); #define REGISTER_PRIVATEUSE1_DISPATCH(name, fn) \ - static RegisterPRIVATEUSE1Dispatch name ## __register(name, fn); + static RegisterPRIVATEUSE1Dispatch name ## __register(name, fn); // NB: This macro must be used in an actual 'cu' file; if you try using // it from a 'cpp' file it will not work! diff --git a/aten/src/ATen/native/Distance.cpp b/aten/src/ATen/native/Distance.cpp index 4b33b713a5b6c..5af87802a1246 100644 --- a/aten/src/ATen/native/Distance.cpp +++ b/aten/src/ATen/native/Distance.cpp @@ -310,14 +310,13 @@ Tensor cosine_similarity(const Tensor& x1_, const Tensor& x2_, int64_t dim, doub auto x2_is_int = c10::isIntegralType(x2_.scalar_type(), /*includeBool=*/true); auto x1_t = x1_is_int ? x1_.to(commonDtype) : x1_; auto x2_t = x2_is_int ? x2_.to(commonDtype) : x2_; - c10::MaybeOwned x1, x2; - std::tie(x1, x2) = expand_outplace(x1_t, x2_t); + auto [x1, x2] = expand_outplace(x1_t, x2_t); // We want to divide each tensor by its norm first, as it's more numerically stable.
// This keeps the result between -1.0 and 1.0 // We clone them, as we're going to modify them in-place - // This allows the gradients to propagate propertly all the way to x1 and x2 + // This allows the gradients to propagate properly all the way to x1 and x2 auto x1_norm = at::linalg_vector_norm(*x1, 2, /*dim=*/dim, /*keepdim=*/true).clone(); auto x2_norm = at::linalg_vector_norm(*x2, 2, /*dim=*/dim, /*keepdim=*/true).clone(); diff --git a/aten/src/ATen/native/Distributions.cpp b/aten/src/ATen/native/Distributions.cpp index c183beab157ff..4d4eb2efaf401 100644 --- a/aten/src/ATen/native/Distributions.cpp +++ b/aten/src/ATen/native/Distributions.cpp @@ -4,7 +4,6 @@ #include #include #include -#include #include #include diff --git a/aten/src/ATen/native/Dropout.cpp b/aten/src/ATen/native/Dropout.cpp index 47da7c1bf8cc2..7014ec65d1f5a 100644 --- a/aten/src/ATen/native/Dropout.cpp +++ b/aten/src/ATen/native/Dropout.cpp @@ -99,7 +99,7 @@ ALIAS_SPECIALIZATION(_feature_dropout, true, false) ALIAS_SPECIALIZATION(_alpha_dropout, false, true ) ALIAS_SPECIALIZATION(_feature_alpha_dropout, true, true ) -} // anomymous namepsace +} // anonymous namespace std::tuple native_dropout_cpu(const Tensor& input, double p, c10::optional train) { diff --git a/aten/src/ATen/native/Embedding.cpp b/aten/src/ATen/native/Embedding.cpp index 07253c5ed566d..705b08ab39f06 100644 --- a/aten/src/ATen/native/Embedding.cpp +++ b/aten/src/ATen/native/Embedding.cpp @@ -124,18 +124,18 @@ Tensor embedding_dense_backward_cpu( auto add_iter = TensorIteratorConfig() .add_output(grad_weight) .add_input(grad_weight) - .add_input(grad) + .add_const_input(grad) .resize_outputs(false) .declare_static_shape(grad.sizes(), /*squash_dims=*/0) .build(); const auto gW_data = reinterpret_cast(grad_weight.data_ptr()); - const auto gO_data = reinterpret_cast(grad.data_ptr()); + const auto gO_data = reinterpret_cast(grad.const_data_ptr()); const auto gW_stride = grad_weight.strides()[0] * grad_weight.element_size(); const auto gO_stride = grad.strides()[0] * grad.element_size(); AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_dense_backward_cpu", [&] () { - auto indices_data = indices_contig.data_ptr(); + auto indices_data = indices_contig.const_data_ptr(); // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) std::unique_ptr counts; @@ -164,7 +164,7 @@ Tensor embedding_dense_backward_cpu( // grad_weight[k].add_(grad[i], scale); iter.unsafe_replace_operand(0, gW_data + k * gW_stride); iter.unsafe_replace_operand(1, gW_data + k * gW_stride); - iter.unsafe_replace_operand(2, gO_data + i * gO_stride); + iter.unsafe_replace_operand(2, const_cast(gO_data + i * gO_stride)); add_stub(kCPU, iter, scale); } } @@ -189,7 +189,7 @@ Tensor & embedding_renorm_cpu_( auto num_indices = indices.numel(); AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_renorm_cpu_", [&]() { - auto data_ptr = indices_contig.data_ptr(); + auto data_ptr = indices_contig.const_data_ptr(); auto sorted_indices = std::vector(data_ptr, data_ptr + num_indices); std::sort(sorted_indices.begin(), sorted_indices.end()); diff --git a/aten/src/ATen/native/EmbeddingBag.cpp b/aten/src/ATen/native/EmbeddingBag.cpp index 0062e6b0804b6..8b6c90dae2375 100644 --- a/aten/src/ATen/native/EmbeddingBag.cpp +++ b/aten/src/ATen/native/EmbeddingBag.cpp @@ -70,16 +70,16 @@ static void make_offset2bag(const Tensor &offsets, Tensor& offset2bag) { namespace { -std::pair promoteIndicesAndOffsets( +std::pair, c10::MaybeOwned> promoteIndicesAndOffsets( const Tensor& 
indices, const Tensor& offsets) { const auto commonType = promoteTypes(offsets.scalar_type(), indices.scalar_type()); return { - indices.scalar_type() == commonType ? indices - : indices.toType(commonType), - offsets.scalar_type() == commonType ? offsets - : offsets.toType(commonType)}; + indices.scalar_type() == commonType ? c10::MaybeOwned::borrowed(indices) + : c10::MaybeOwned::owned(indices.toType(commonType)), + offsets.scalar_type() == commonType ? c10::MaybeOwned::borrowed(offsets) + : c10::MaybeOwned::owned(offsets.toType(commonType))}; } // Determines if we can use a fast implementation for index_select_add, which @@ -125,9 +125,9 @@ index_select_add( index_t padding_idx, _EmbeddingBagKernelCache* /* fbgemm_kernel_cache */) { TORCH_CHECK(select_indices.numel() == add_indices.numel()); - auto* add_indices_data = add_indices.data_ptr(); - auto* select_indices_data = select_indices.data_ptr(); - auto* src_data = src.data_ptr(); + auto* add_indices_data = add_indices.const_data_ptr(); + auto* select_indices_data = select_indices.const_data_ptr(); + auto* src_data = src.const_data_ptr(); auto* output_data = output.data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) index_t* bag_size_data = nullptr; @@ -208,14 +208,14 @@ index_select_add( index_t padding_idx, _EmbeddingBagKernelCache* fbgemm_kernel_cache) { int64_t ddim = src.size(1); - auto* select_indices_data = select_indices.data_ptr(); + auto* select_indices_data = select_indices.const_data_ptr(); auto* output_data = output.data_ptr(); if (is_fast_path_index_select(src, output, padding_idx)) { auto src_contig = src.contiguous(); - auto* src_data = src_contig.data_ptr(); + auto* src_data = src_contig.const_data_ptr(); int64_t output_size = offsets.numel() - 1; - auto* offsets_data = offsets.data_ptr(); + auto* offsets_data = offsets.const_data_ptr(); std::vector offsets_include_last; if (include_last_offset) { @@ -316,8 +316,8 @@ index_select_add( #endif } else { TORCH_CHECK(select_indices.numel() == add_indices.numel()); - auto* src_data = src.data_ptr(); - auto* add_indices_data = add_indices.data_ptr(); + auto* src_data = src.const_data_ptr(); + auto* add_indices_data = add_indices.const_data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) index_t* bag_size_data = nullptr; if (bag_size.defined()) { @@ -388,14 +388,14 @@ index_select_add(const Tensor &select_indices, index_t padding_idx, _EmbeddingBagKernelCache* fbgemm_kernel_cache) { int64_t ddim = src.size(1); - auto* select_indices_data = select_indices.data_ptr(); + auto* select_indices_data = select_indices.const_data_ptr(); auto* output_data = output.data_ptr(); if (is_fast_path_index_select(src, output, padding_idx)) { auto src_contig = src.contiguous(); - auto* src_data = src_contig.data_ptr(); + auto* src_data = src_contig.const_data_ptr(); int64_t output_size = offsets.numel() - 1; - auto* offsets_data = offsets.data_ptr(); + auto* offsets_data = offsets.const_data_ptr(); std::vector offsets_include_last; if (include_last_offset) { @@ -463,8 +463,8 @@ index_select_add(const Tensor &select_indices, }); } else { AT_ASSERT(select_indices.numel() == add_indices.numel()); - auto* src_data = src.data_ptr(); - auto* add_indices_data = add_indices.data_ptr(); + auto* src_data = src.const_data_ptr(); + auto* add_indices_data = add_indices.const_data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) index_t* bag_size_data = nullptr; if (bag_size.defined()) { @@ -519,9 +519,9 @@ index_select_scale_add( index_t padding_idx, _EmbeddingBagKernelCache* 
/* fbgemm_kernel_cache */) { AT_ASSERT(select_indices.numel() == add_indices.numel()); - auto* add_indices_data = add_indices.data_ptr(); - auto* select_indices_data = select_indices.data_ptr(); - auto* src_data = src.data_ptr(); + auto* add_indices_data = add_indices.const_data_ptr(); + auto* select_indices_data = select_indices.const_data_ptr(); + auto* src_data = src.const_data_ptr(); auto* output_data = output.data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) index_t* bag_size_data = nullptr; @@ -536,7 +536,7 @@ index_select_scale_add( auto output_stride0 = output.strides()[0]; auto output_stride1 = output.strides()[1]; - auto* scale_data = scale.data_ptr(); + auto* scale_data = scale.const_data_ptr(); auto scale_stride = scale.strides()[0]; for (const auto i : c10::irange(numel)) { @@ -579,15 +579,15 @@ index_select_scale_add( index_t padding_idx, _EmbeddingBagKernelCache* fbgemm_kernel_cache) { int64_t ddim = src.size(1); - auto* scale_data = scale.data_ptr(); - auto* select_indices_data = select_indices.data_ptr(); + auto* scale_data = scale.const_data_ptr(); + auto* select_indices_data = select_indices.const_data_ptr(); auto* output_data = output.data_ptr(); if (is_fast_path_index_select_scale(src, scale, output, padding_idx)) { auto src_contig = src.contiguous(); - auto* src_data = src_contig.data_ptr(); + auto* src_data = src_contig.const_data_ptr(); int64_t output_size = offsets.numel() - 1; - auto* offsets_data = offsets.data_ptr(); + auto* offsets_data = offsets.const_data_ptr(); std::vector offsets_include_last; if (include_last_offset) { @@ -705,8 +705,8 @@ index_select_scale_add( #endif } else { AT_ASSERT(select_indices.numel() == add_indices.numel()); - auto* src_data = src.data_ptr(); - auto* add_indices_data = add_indices.data_ptr(); + auto* src_data = src.const_data_ptr(); + auto* add_indices_data = add_indices.const_data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) index_t* bag_size_data = nullptr; if (bag_size.defined()) { @@ -770,15 +770,15 @@ index_select_scale_add(const Tensor &select_indices, index_t padding_idx, _EmbeddingBagKernelCache* fbgemm_kernel_cache) { int64_t ddim = src.size(1); - auto* scale_data = scale.data_ptr(); - auto* select_indices_data = select_indices.data_ptr(); + auto* scale_data = scale.const_data_ptr(); + auto* select_indices_data = select_indices.const_data_ptr(); auto* output_data = output.data_ptr(); if (is_fast_path_index_select_scale(src, scale, output, padding_idx)) { auto src_contig = src.contiguous(); - auto* src_data = src_contig.data_ptr(); + auto* src_data = src_contig.const_data_ptr(); int64_t output_size = offsets.numel() - 1; - auto* offsets_data = offsets.data_ptr(); + auto* offsets_data = offsets.const_data_ptr(); std::vector offsets_include_last; if (include_last_offset) { @@ -844,8 +844,8 @@ index_select_scale_add(const Tensor &select_indices, }); } else { AT_ASSERT(select_indices.numel() == add_indices.numel()); - auto* src_data = src.data_ptr(); - auto* add_indices_data = add_indices.data_ptr(); + auto* src_data = src.const_data_ptr(); + auto* add_indices_data = add_indices.const_data_ptr(); // NOLINTNEXTLINE(cppcoreguidelines-init-variables) index_t* bag_size_data = nullptr; if (bag_size.defined()) { @@ -1089,7 +1089,7 @@ void embedding_bag_cpu_max_out( int64_t featureSize = weight.size(1); int64_t vocab_size = weight.size(0); AT_DISPATCH_INDEX_TYPES(indices.scalar_type(), "embedding_bag_cpu_max_out", [&] { - auto* indices_data = indices.data_ptr(); + auto* indices_data = 
indices.const_data_ptr(); auto* offset2bag_data = offset2bag.data_ptr(); index_t* max_indices_data = nullptr; @@ -1099,7 +1099,7 @@ void embedding_bag_cpu_max_out( max_indices_stride = max_indices->strides()[0]; } - auto* weight_data = weight.data_ptr(); + auto* weight_data = weight.const_data_ptr(); auto* output_data = output.data_ptr(); auto* bag_size_data = bag_size.data_ptr(); auto weight_stride0 = weight.strides()[0]; @@ -1210,8 +1210,9 @@ static std::tuple _embedding_bag_cpu_impl( TORCH_CHECK(weight.dim() == 2, "weight has to be a 2D Tensor, but got Tensor of dimension ", weight.dim()); - Tensor indices, offsets; - std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_); + auto [indicesMaybeOwned, offsetsMaybeOwned] = promoteIndicesAndOffsets(indices_, offsets_); + const auto& indices = *indicesMaybeOwned; + const auto& offsets = *offsetsMaybeOwned; check_arguments(weight, indices, offsets, mode, per_sample_weights, include_last_offset); Tensor output = at::empty( @@ -1331,8 +1332,8 @@ void _embedding_bag_cpu_out( at::Tensor& bag_size, at::Tensor* p_max_indices, const at::Tensor& weight, - const at::Tensor& indices, - const at::Tensor& offsets, + const at::Tensor& indices_, + const at::Tensor& offsets_, const bool /* scale_grad_by_freq */, const int64_t mode, const bool /* sparse */, @@ -1340,6 +1341,9 @@ void _embedding_bag_cpu_out( const bool include_last_offset, const c10::optional& padding_idx, _EmbeddingBagKernelCache* fbgemm_kernel_cache) { + auto [indicesMaybeOwned, offsetsMaybeOwned] = promoteIndicesAndOffsets(indices_, offsets_); + const auto& indices = *indicesMaybeOwned; + const auto& offsets = *offsetsMaybeOwned; at::native::check_arguments( weight, indices, offsets, mode, per_sample_weights, include_last_offset); @@ -1410,8 +1414,9 @@ Tensor _embedding_bag_backward_symint(const Tensor &grad, const Tensor &indices_ c10::MaybeOwned per_sample_weights_maybe_owned = at::borrow_from_optional_tensor(per_sample_weights_opt); const Tensor& per_sample_weights = *per_sample_weights_maybe_owned; - Tensor indices, offsets; - std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_); + auto [indicesMaybeOwned, offsetsMaybeOwned] = promoteIndicesAndOffsets(indices_, offsets_); + const auto& indices = *indicesMaybeOwned; + const auto& offsets = *offsetsMaybeOwned; auto indices_arg = TensorArg(indices, "indices", 1); checkScalarTypes("embedding_bag", indices_arg, {kLong, kInt}); checkContiguous("embedding_bag", indices_arg); @@ -1473,7 +1478,7 @@ static Tensor _embedding_bag_dense_backward_cpu_max( template static std::vector compute_counts( int64_t num_weights, - index_t* indices_data, + const index_t* indices_data, int64_t indices_length) { std::vector counts(num_weights, 0); for (const auto i : c10::irange(indices_length)) { @@ -1494,7 +1499,7 @@ static std::vector compute_counts( template static std::vector compute_counts_uniq( int64_t num_weights, - index_t* indices_data, + const index_t* indices_data, int64_t indices_length, const std::vector& counts) { std::vector counts_uniq; @@ -1533,11 +1538,11 @@ void _embedding_bag_dense_backward_cpu_sum_mean( optional per_sample_weights; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - scalar_t* per_sample_weights_data; + const scalar_t* per_sample_weights_data; optional per_sample_weights_stride; if (per_sample_weights_.defined()) { per_sample_weights = per_sample_weights_.index_select(0, ind_sort); - per_sample_weights_data = per_sample_weights->data_ptr(); + per_sample_weights_data = 
per_sample_weights->const_data_ptr(); per_sample_weights_stride = per_sample_weights->strides()[0]; } @@ -1549,9 +1554,9 @@ void _embedding_bag_dense_backward_cpu_sum_mean( [&indices, &offset2bag, &bag_size_, &num_weights, &numel, &per_sample_weights, &per_sample_weights_data, &per_sample_weights_stride, &mode, &scale_grad_by_freq, &grad, &index_grad_weight, &padding_idx] { - auto* indices_data = indices.data_ptr(); - auto* offset2bag_data = offset2bag.data_ptr(); - auto* bag_size_data = bag_size_.data_ptr(); + auto* indices_data = indices.const_data_ptr(); + auto* offset2bag_data = offset2bag.const_data_ptr(); + auto* bag_size_data = bag_size_.const_data_ptr(); auto counts = compute_counts(num_weights, indices_data, numel); auto next_unique_index_idx = @@ -1585,7 +1590,7 @@ void _embedding_bag_dense_backward_cpu_sum_mean( } int64_t ddim = grad.size(1); auto igwd = index_grad_weight.data_ptr(); - auto gd = grad.data_ptr(); + auto gd = grad.const_data_ptr(); at::native::cpublas::axpy(ddim, (scalar_t)scale, gd + ddim * source, 1, igwd + ddim * index, 1); } @@ -1666,8 +1671,10 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu_template( AT_ASSERT(grad.dim() == 2); auto embedding_features = grad.sizes()[1]; - Tensor indices, offsets; - std::tie(indices, offsets) = promoteIndicesAndOffsets(indices_, offsets_); + auto [indicesMaybeOwned, offsetsMaybeOwned] = promoteIndicesAndOffsets(indices_, offsets_); + const auto& indices = *indicesMaybeOwned; + const auto& offsets = *offsetsMaybeOwned; + AT_ASSERT(indices.dim() == 1); auto num_samples = indices.size(0); @@ -1695,11 +1702,11 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu_template( offset2bag_ = offset2bag; } - auto* grad_data = grad.data_ptr(); + auto* grad_data = grad.const_data_ptr(); auto grad_stride0 = grad.strides()[0]; auto grad_stride1 = grad.strides()[1]; - auto* weight_data = weight.data_ptr(); + auto* weight_data = weight.const_data_ptr(); auto weight_stride0 = weight.strides()[0]; auto weight_stride1 = weight.strides()[1]; @@ -1709,11 +1716,11 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu_template( [&indices, &output, &offset2bag_, &num_samples, &embedding_features, &grad_data, &grad_stride0, &grad_stride1, &weight_data, &weight_stride0, &weight_stride1, &padding_idx] () { - auto* indices_data = indices.data_ptr(); + auto* indices_data = indices.const_data_ptr(); // The following are contiguous auto* output_data = output.data_ptr(); - auto* offset2bag_data = offset2bag_.data_ptr(); + auto* offset2bag_data = offset2bag_.const_data_ptr(); // XXX: 64 was arbitrarily chosen. There is probably a sweet spot for this number. 
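A brief sketch of the at::parallel_for contract behind the grain size mentioned in the comment above, assuming the commonly documented behavior; the loop body here is a placeholder, not the patch's code:

// parallel_for(begin, end, grain_size, f) partitions [begin, end) across threads;
// each task receives a contiguous [start, end) chunk, and ranges smaller than
// grain_size are typically run serially on the calling thread, so grain_size
// trades scheduling overhead against load balance.
at::parallel_for(0, num_samples, /*grain_size=*/64, [&](int64_t start, int64_t end) {
  for (const auto sample_idx : c10::irange(start, end)) {
    // per-sample work goes here
  }
});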
parallel_for(0, num_samples, 64, @@ -1726,8 +1733,8 @@ Tensor _embedding_bag_per_sample_weights_backward_cpu_template( if (embedding_idx != static_cast(padding_idx)) { output_data[sample_idx] = dot_impl( embedding_features, - grad_data + grad_stride0 * bag_idx, grad_stride1, - weight_data + weight_stride0 * embedding_idx, weight_stride1); + const_cast(grad_data + grad_stride0 * bag_idx), grad_stride1, + const_cast(weight_data + weight_stride0 * embedding_idx), weight_stride1); } } }); diff --git a/aten/src/ATen/native/ForeachUtils.h b/aten/src/ATen/native/ForeachUtils.h index 07ad16d3110b0..4e8963da05521 100644 --- a/aten/src/ATen/native/ForeachUtils.h +++ b/aten/src/ATen/native/ForeachUtils.h @@ -216,7 +216,7 @@ inline std::vector convert_tensor_to_scalar_list( scalarList_.scalar_type(), "convert_tensor_to_scalar_list", [&]() { - const scalar_t* scalar_data = scalarList_.data_ptr(); + const scalar_t* scalar_data = scalarList_.const_data_ptr(); TORCH_CHECK( (expect_length == scalarList_.size(0)), "Expected length of scalars to match input of length ", @@ -248,7 +248,7 @@ inline bool can_use_fast_route( } using DeviceDtypeKey = std::pair; -using IndicesT = std::vector; +using IndicesT = std::vector; using nested_optional_tensorvec_t = std::vector>>; using TensorsAndIndicesT = std::pair; diff --git a/aten/src/ATen/native/FractionalMaxPool2d.cpp b/aten/src/ATen/native/FractionalMaxPool2d.cpp index c9797b2e70a53..d1a5808d0c66c 100644 --- a/aten/src/ATen/native/FractionalMaxPool2d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool2d.cpp @@ -131,10 +131,10 @@ namespace { template static void fractional_max_pool2d_out_single_batch_frame( - scalar_t* input, + const scalar_t* input, scalar_t* output, int64_t* indices, - scalar_t* randomSamples, + const scalar_t* randomSamples, int numPlanes, int inputW, int inputH, int outputW, int outputH, @@ -142,7 +142,7 @@ static void fractional_max_pool2d_out_single_batch_frame( at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) { for (const auto plane : c10::irange(start, end)) { /* each plane contains 2 random samples, one for W and one for H */ - scalar_t* randomSamplesForPlane = randomSamples + plane * 2; + const scalar_t* randomSamplesForPlane = randomSamples + plane * 2; /* Generate interval sequence */ auto sequenceW = generate_intervals( @@ -154,7 +154,7 @@ static void fractional_max_pool2d_out_single_batch_frame( // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int h, w; - scalar_t* inputForPlane = input + plane * inputW * inputH; + const scalar_t* inputForPlane = input + plane * inputW * inputH; scalar_t* outputForPlane = output + plane * outputW * outputH; int64_t* indicesForPlane = indices + plane * outputW * outputH; @@ -192,10 +192,10 @@ static void fractional_max_pool2d_out_single_batch_frame( template static void fractional_max_pool2d_out_frame( - scalar_t* input, + const scalar_t* input, scalar_t* output, int64_t* indices, - scalar_t* randomSamples, + const scalar_t* randomSamples, int numBatch, int numPlanes, int inputW, int inputH, int outputW, int outputH, @@ -225,16 +225,16 @@ static void fractional_max_pool2d_out_frame( template static void fractional_max_pool2d_backward_out_single_batch_frame( scalar_t* gradInput, - scalar_t* gradOutput, - int64_t* indices, + const scalar_t* gradOutput, + const int64_t* indices, int numPlanes, int inputW, int inputH, int outputW, int outputH) { at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) { for (const auto plane : c10::irange(start, end)) { scalar_t* gradInputForPlane 
= gradInput + plane * inputW * inputH; - scalar_t* gradOutputForPlane = gradOutput + plane * outputW * outputH; - int64_t* indicesForPlane = indices + plane * outputW * outputH; + const scalar_t* gradOutputForPlane = gradOutput + plane * outputW * outputH; + const int64_t* indicesForPlane = indices + plane * outputW * outputH; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int h, w; @@ -254,8 +254,8 @@ static void fractional_max_pool2d_backward_out_single_batch_frame( template static void fractional_max_pool2d_backward_out_frame( scalar_t* gradInput, - scalar_t* gradOutput, - int64_t* indices, + const scalar_t* gradOutput, + const int64_t* indices, int numBatch, int numPlanes, int inputW, int inputH, int outputW, int outputH) { @@ -326,10 +326,10 @@ TORCH_IMPL_FUNC(fractional_max_pool2d_out_cpu) ( kHalf, input.scalar_type(), "fractional_max_pool2d_out_frame", [&] { - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); auto indices_data = indices.data_ptr(); - auto randomSamples_data = randomSamples.data_ptr(); + auto randomSamples_data = randomSamples.const_data_ptr(); fractional_max_pool2d_out_frame( input_data, output_data, @@ -383,8 +383,8 @@ TORCH_IMPL_FUNC(fractional_max_pool2d_backward_cpu) ( kHalf, input.scalar_type(), "fractional_max_pool2d_backward_out_frame", [&] { auto gradInput_data = gradInput.data_ptr(); - auto gradOutput_data = gradOutput.data_ptr(); - auto indices_data = indices.data_ptr(); + auto gradOutput_data = gradOutput.const_data_ptr(); + auto indices_data = indices.const_data_ptr(); fractional_max_pool2d_backward_out_frame( gradInput_data, gradOutput_data, diff --git a/aten/src/ATen/native/FractionalMaxPool3d.cpp b/aten/src/ATen/native/FractionalMaxPool3d.cpp index 9763cebd3ffbd..79da29a6f4462 100644 --- a/aten/src/ATen/native/FractionalMaxPool3d.cpp +++ b/aten/src/ATen/native/FractionalMaxPool3d.cpp @@ -101,10 +101,10 @@ namespace { template static void fractional_max_pool3d_out_single_batch_frame( - scalar_t* input, + const scalar_t* input, scalar_t* output, int64_t* indices, - scalar_t* randomSamples, + const scalar_t* randomSamples, int64_t numPlanes, int64_t inputT, int64_t inputH, int64_t inputW, int64_t outputT, int64_t outputH, int64_t outputW, @@ -114,7 +114,7 @@ static void fractional_max_pool3d_out_single_batch_frame( for (const auto plane : c10::irange(start, end)) { /* each plane contains 3 random samples, one for T, one for W, and one for H */ - scalar_t* randomSamplesForPlane = randomSamples + plane * 3; + const scalar_t* randomSamplesForPlane = randomSamples + plane * 3; /* Generate interval sequence */ auto sequenceT = generate_intervals( @@ -128,7 +128,7 @@ static void fractional_max_pool3d_out_single_batch_frame( // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t t, h, w; - scalar_t* inputForPlane = input + plane * inputT * inputH * inputW; + const scalar_t* inputForPlane = input + plane * inputT * inputH * inputW; scalar_t* outputForPlane = output + plane * outputT * outputH * outputW; int64_t* indicesForPlane = indices + plane * outputT * outputH * outputW; @@ -173,10 +173,10 @@ static void fractional_max_pool3d_out_single_batch_frame( template static void fractional_max_pool3d_out_frame( - scalar_t* input, + const scalar_t* input, scalar_t* output, int64_t* indices, - scalar_t* randomSamples, + const scalar_t* randomSamples, int64_t numBatch, int64_t numPlanes, int64_t inputT, int64_t inputH, int64_t inputW, int64_t outputT, int64_t outputH, int64_t outputW, @@ 
-244,10 +244,10 @@ TORCH_IMPL_FUNC(fractional_max_pool3d_out_cpu)( "fractional_max_pool3d_out_frame", [&] { fractional_max_pool3d_out_frame( - input.data_ptr(), + input.const_data_ptr(), output.data_ptr(), indices.data_ptr(), - randomSamples.data_ptr(), + randomSamples.const_data_ptr(), numBatch, numPlanes, inputT, inputH, inputW, outputT, outputH, outputW, @@ -262,8 +262,8 @@ namespace { template static void fractional_max_pool3d_backward_out_single_batch_frame( scalar_t* gradInput, - scalar_t* gradOutput, - int64_t* indices, + const scalar_t* gradOutput, + const int64_t* indices, int64_t numPlanes, int64_t inputT, int64_t inputH, int64_t inputW, int64_t outputT, int64_t outputH, int64_t outputW) { @@ -271,9 +271,9 @@ static void fractional_max_pool3d_backward_out_single_batch_frame( at::parallel_for(0, numPlanes, 0, [&](int64_t start, int64_t end) { for (const auto plane : c10::irange(start, end)) { scalar_t* gradInputForPlane = gradInput + plane * inputT * inputH * inputW; - scalar_t* gradOutputForPlane = gradOutput + + const scalar_t* gradOutputForPlane = gradOutput + plane * outputT * outputH * outputW; - int64_t* indicesForPlane = indices + plane * outputT * outputH * outputW; + const int64_t* indicesForPlane = indices + plane * outputT * outputH * outputW; // NOLINTNEXTLINE(cppcoreguidelines-init-variables) int64_t h, w, t; @@ -294,8 +294,8 @@ static void fractional_max_pool3d_backward_out_single_batch_frame( template static void fractional_max_pool3d_backward_out_frame( scalar_t* gradInput, - scalar_t* gradOutput, - int64_t* indices, + const scalar_t* gradOutput, + const int64_t* indices, int64_t numBatch, int64_t numPlanes, int64_t inputT, int64_t inputH, int64_t inputW, int64_t outputT, int64_t outputH, int64_t outputW) { @@ -381,8 +381,8 @@ void fractional_max_pool3d_backward_out_cpu_template( [&]{ fractional_max_pool3d_backward_out_frame( gradInput.data_ptr(), - gradOutput.data_ptr(), - indices.data_ptr(), + gradOutput.const_data_ptr(), + indices.const_data_ptr(), numBatch, numPlanes, inputT, inputH, inputW, outputT, outputH, outputW diff --git a/aten/src/ATen/native/FusedAdam.cpp b/aten/src/ATen/native/FusedAdam.cpp new file mode 100644 index 0000000000000..b3be769b24f18 --- /dev/null +++ b/aten/src/ATen/native/FusedAdam.cpp @@ -0,0 +1,175 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#include +#endif +namespace at { + +namespace native { + +void _fused_adam_kernel_cpu_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool amsgrad, + const bool maximize, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + const float* grad_scale_ptr = + grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; + const float* found_inf_ptr = + found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; + if (found_inf_ptr && *found_inf_ptr == 1.0) { + return; + } + size_t n_tensors = params.size(); + TORCH_CHECK(grads.size() == n_tensors); + TORCH_CHECK(exp_avgs.size() == n_tensors); + TORCH_CHECK(exp_avg_sqs.size() == n_tensors); + if (amsgrad) { + TORCH_CHECK(max_exp_avg_sqs.size() == n_tensors); + } else { + TORCH_CHECK(max_exp_avg_sqs.size() == 0); + } + TORCH_CHECK(state_steps.size() == n_tensors); + at::Tensor max_exp_avg_sq = at::Tensor(); + for (size_t i = 0; i < n_tensors; i++){ + if (amsgrad) max_exp_avg_sq = max_exp_avg_sqs[i]; + fused_adam_stub( + kCPU, + params[i], + grads[i], + exp_avgs[i], + exp_avg_sqs[i], + max_exp_avg_sq, + state_steps[i], + lr, + beta1, + beta2, + weight_decay, + eps, + amsgrad, + maximize, + grad_scale_ptr, + ADAM_MODE::ORIGINAL); + } +} + +// The following overload simply has a Tensor lr +void _fused_adam_kernel_cpu_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool amsgrad, + const bool maximize, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + _fused_adam_kernel_cpu_(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr.item(), beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf); +} + +void _fused_adamw_kernel_cpu_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool amsgrad, + const bool maximize, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + const float* grad_scale_ptr = + grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; + const float* found_inf_ptr = + found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; + if (found_inf_ptr && *found_inf_ptr == 1.0) { + return; + } + size_t n_tensors = params.size(); + TORCH_CHECK(grads.size() == n_tensors); + TORCH_CHECK(exp_avgs.size() == n_tensors); + TORCH_CHECK(exp_avg_sqs.size() == n_tensors); + if (amsgrad) { + TORCH_CHECK(max_exp_avg_sqs.size() == n_tensors); + } else { + TORCH_CHECK(max_exp_avg_sqs.size() == 0); + } + TORCH_CHECK(state_steps.size() == n_tensors); + at::Tensor max_exp_avg_sq = at::Tensor(); + for (size_t i = 0; i < n_tensors; i++){ + if (amsgrad) max_exp_avg_sq = max_exp_avg_sqs[i]; + fused_adam_stub( + kCPU, + params[i], + grads[i], + exp_avgs[i], + exp_avg_sqs[i], + max_exp_avg_sq, + state_steps[i], + lr, + beta1, + beta2, + weight_decay, + eps, + amsgrad, + maximize, + grad_scale_ptr, + ADAM_MODE::ADAMW); + } +} + +// The following overload simply has a Tensor lr +void _fused_adamw_kernel_cpu_( + at::TensorList params, + at::TensorList grads, + at::TensorList exp_avgs, + at::TensorList exp_avg_sqs, + at::TensorList max_exp_avg_sqs, + at::TensorList state_steps, + const at::Tensor& lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool amsgrad, + const bool maximize, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + _fused_adamw_kernel_cpu_(params, grads, exp_avgs, exp_avg_sqs, max_exp_avg_sqs, state_steps, lr.item(), beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale, found_inf); +} + + +DEFINE_DISPATCH(fused_adam_stub); + +} +} diff --git a/aten/src/ATen/native/FusedAdam.h b/aten/src/ATen/native/FusedAdam.h new file mode 100644 index 0000000000000..6fbbaf2441297 --- /dev/null +++ b/aten/src/ATen/native/FusedAdam.h @@ -0,0 +1,30 @@ +#include +#include + +namespace at { + +namespace native { + +enum class ADAM_MODE : uint8_t { ORIGINAL = 0, ADAMW = 1 }; + +using fused_adam_fn = void (*)( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& exp_avg, + const at::Tensor& exp_avg_sq, + const at::Tensor& max_exp_avg_sq, + const at::Tensor& state_step, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool amsgrad, + const bool maximize, + const float* grad_scale_ptr, + const ADAM_MODE); + +DECLARE_DISPATCH(fused_adam_fn, fused_adam_stub); + +} +} diff --git a/aten/src/ATen/native/FusedSGD.cpp b/aten/src/ATen/native/FusedSGD.cpp new file mode 100644 index 0000000000000..56e2e91759113 --- /dev/null +++ b/aten/src/ATen/native/FusedSGD.cpp @@ -0,0 +1,86 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#endif +namespace at { + +namespace native { + + +void _fused_sgd_kernel_cpu_( + at::TensorList params, + at::TensorList grads, + at::TensorList momentum_buffer_list, + const double weight_decay, + const double momentum, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + const float* grad_scale_ptr = + grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; + const float* found_inf_ptr = + found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; + if (found_inf_ptr && *found_inf_ptr == 1.0) { + return; + } + size_t n_tensors = params.size(); + TORCH_CHECK(grads.size() == n_tensors); + bool no_momentum_buffer = momentum == 0.0; + if (no_momentum_buffer) { + TORCH_CHECK(momentum_buffer_list.size() == 0); + } else { + TORCH_CHECK(momentum_buffer_list.size() == n_tensors); + } + for (size_t i = 0; i < n_tensors; i++){ + fused_sgd_stub( + kCPU, + params[i], + grads[i], + no_momentum_buffer ? Tensor() : momentum_buffer_list[i], + weight_decay, + momentum, + lr, + dampening, + nesterov, + maximize, + is_first_step, + grad_scale_ptr); + } +} + +void _fused_sgd_kernel_cpu_( + at::TensorList params, + at::TensorList grads, + at::TensorList momentum_buffer_list, + const double weight_decay, + const double momentum, + const at::Tensor& lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + _fused_sgd_kernel_cpu_( + params, grads, momentum_buffer_list, weight_decay, + momentum, lr.item(), dampening, nesterov, + maximize, is_first_step, grad_scale, found_inf + ); +} + +DEFINE_DISPATCH(fused_sgd_stub); + +} +} diff --git a/aten/src/ATen/native/FusedSGD.h b/aten/src/ATen/native/FusedSGD.h new file mode 100644 index 0000000000000..62cd3c8aef73b --- /dev/null +++ b/aten/src/ATen/native/FusedSGD.h @@ -0,0 +1,24 @@ +#include +#include + +namespace at { + +namespace native { + +using fused_sgd_fn = void (*)( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& momentum_buffer, + const double weight_decay, + const double momentum, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const float* grad_scale_ptr); + +DECLARE_DISPATCH(fused_sgd_fn, fused_sgd_stub); + +} +} diff --git a/aten/src/ATen/native/GatedLinearUnit.cpp b/aten/src/ATen/native/GatedLinearUnit.cpp index 73028d12f9c8b..3a4aaab632ced 100644 --- a/aten/src/ATen/native/GatedLinearUnit.cpp +++ b/aten/src/ATen/native/GatedLinearUnit.cpp @@ -71,9 +71,9 @@ Tensor& glu_backward_cpu_out(const Tensor& grad_output, const Tensor& input, // for second gradinput half, can get a better performance by fusion auto iter = at::TensorIteratorConfig() .add_output(gradInputsecondHalf) - .add_input(gradInputfirstHalf) - .add_input(firstHalf) - .add_input(grad_output) + .add_const_input(gradInputfirstHalf) + .add_const_input(firstHalf) + .add_const_input(grad_output) .build(); glu_backward_stub(iter.device_type(), iter); gradInputfirstHalf.mul_(grad_output); @@ -99,10 +99,10 @@ Tensor glu_jvp( auto dglu = at::empty_like(glu); auto iter = at::TensorIteratorConfig() .add_output(dglu) - .add_input(glu) - .add_input(b) - .add_input(da) - .add_input(db) + .add_const_input(glu) + .add_const_input(b) + .add_const_input(da) + .add_const_input(db) .build(); glu_jvp_stub(iter.device_type(), iter); return dglu; diff --git a/aten/src/ATen/native/GridSampler.cpp b/aten/src/ATen/native/GridSampler.cpp index 329f22bd99777..5d0259eeb1ba2 100644 --- a/aten/src/ATen/native/GridSampler.cpp +++ b/aten/src/ATen/native/GridSampler.cpp @@ -75,19 +75,19 @@ namespace { int64_t out_sD = output.stride(2); int64_t out_sH = output.stride(3); int64_t out_sW = output.stride(4); - scalar_t *inp_ptr = input.data_ptr(); + const scalar_t *inp_ptr = input.const_data_ptr(); scalar_t *out_ptr = output.data_ptr(); - scalar_t *grid_ptr = grid.data_ptr(); + const scalar_t *grid_ptr = grid.const_data_ptr(); // 
loop over each output pixel at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) { for (const auto n : c10::irange(start, end)) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; + const scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; + const scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; for (const auto d : c10::irange(out_D)) { for (const auto h : c10::irange(out_H)) { for (const auto w : c10::irange(out_W)) { // get the corresponding input x, y, z co-ordinates from grid - scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; + const scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; scalar_t ix = *grid_ptr_NDHW; scalar_t iy = grid_ptr_NDHW[grid_sCoor]; scalar_t iz = grid_ptr_NDHW[2 * grid_sCoor]; @@ -144,7 +144,7 @@ namespace { // calculate bilinear weighted pixel value and set output pixel scalar_t *out_ptr_NCDHW = out_ptr + n * out_sN + d * out_sD + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; + const scalar_t *inp_ptr_NC = inp_ptr_N; for (int64_t c = 0; c < C; ++c, out_ptr_NCDHW += out_sC, inp_ptr_NC += inp_sC) { // (c, iz_tnw, iy_tnw, ix_tnw) * tnw + (c, iz_tne, iy_tne, ix_tne) * tne // + (c, iz_tsw, iy_tsw, ix_tsw) * tsw + (c, iz_tse, iy_tse, ix_tse) * tse @@ -181,9 +181,9 @@ namespace { int64_t iy_nearest = static_cast(std::nearbyint(iy)); int64_t iz_nearest = static_cast(std::nearbyint(iz)); - // assign nearest neighor pixel value to output pixel + // assign nearest neighbour pixel value to output pixel scalar_t *out_ptr_NCDHW = out_ptr + n * out_sN + d * out_sD + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; + const scalar_t *inp_ptr_NC = inp_ptr_N; for (int64_t c = 0; c < C; ++c, out_ptr_NCDHW += out_sC, inp_ptr_NC += inp_sC) { if (within_bounds_3d(iz_nearest, iy_nearest, ix_nearest, inp_D, inp_H, inp_W)) { *out_ptr_NCDHW = inp_ptr_NC[iz_nearest * inp_sD + iy_nearest * inp_sH + ix_nearest * inp_sW]; @@ -268,9 +268,9 @@ namespace { } int64_t gGrid_sN = grad_grid.stride(0); int64_t gGrid_sW = grad_grid.stride(3); - scalar_t *inp_ptr = input.data_ptr(); - scalar_t *grid_ptr = grid.data_ptr(); - scalar_t *gOut_ptr = grad_output.data_ptr(); + const scalar_t *inp_ptr = input.const_data_ptr(); + const scalar_t *grid_ptr = grid.const_data_ptr(); + const scalar_t *gOut_ptr = grad_output.const_data_ptr(); scalar_t *gInp_ptr = nullptr; if (input_requires_grad) { gInp_ptr = grad_input.mutable_data_ptr(); @@ -279,14 +279,14 @@ namespace { // loop over each output pixel at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) { for (const auto n : c10::irange(start, end)) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; + const scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; + const scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; scalar_t *gGrid_ptr_NDHW = gGrid_ptr + n * gGrid_sN; for (const auto d : c10::irange(out_D)) { for (const auto h : c10::irange(out_H)) { for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NDHW += gGrid_sW /* grad_grid is contiguous */ ) { // get the corresponding input x, y, z co-ordinates from grid - scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; + const scalar_t *grid_ptr_NDHW = grid_ptr_N + d * grid_sD + h * grid_sH + w * grid_sW; scalar_t ix = *grid_ptr_NDHW; scalar_t iy = grid_ptr_NDHW[grid_sCoor]; scalar_t iz = grid_ptr_NDHW[2 * grid_sCoor]; @@ -344,8 +344,8 @@ namespace { scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); scalar_t gix = 
static_cast(0), giy = static_cast(0), giz = static_cast(0); - scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; + const scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + const scalar_t *inp_ptr_NC = inp_ptr_N; scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; // calculate bilinear weighted pixel value and set output pixel for (int64_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { @@ -422,8 +422,8 @@ namespace { int64_t iy_nearest = static_cast(std::nearbyint(iy)); int64_t iz_nearest = static_cast(std::nearbyint(iz)); - // assign nearest neighor pixel value to output pixel - scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + // assign nearest neighbour pixel value to output pixel + const scalar_t *gOut_ptr_NCDHW = gOut_ptr + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; if (input_requires_grad) { scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; for (int64_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, gInp_ptr_NC += gInp_sC) { @@ -589,18 +589,18 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid, int64_t out_sC = output.stride(1); int64_t out_sH = output.stride(2); int64_t out_sW = output.stride(3); - scalar_t *inp_ptr = input.data_ptr(); + const scalar_t *inp_ptr = input.const_data_ptr(); scalar_t *out_ptr = output.data_ptr(); - scalar_t *grid_ptr = grid.data_ptr(); + const scalar_t *grid_ptr = grid.const_data_ptr(); // loop over each output pixel at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) { for (const auto n : c10::irange(start, end)) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; + const scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; + const scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; for (const auto h : c10::irange(out_H)) { for (const auto w : c10::irange(out_W)) { // get the corresponding input x, y, z co-ordinates from grid - scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; + const scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; scalar_t x = *grid_ptr_NHW; scalar_t y = grid_ptr_NHW[grid_sCoor]; @@ -630,7 +630,7 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid, scalar_t se = (ix - ix_nw) * (iy - iy_nw); // calculate bilinear weighted pixel value and set output pixel - scalar_t *inp_ptr_NC = inp_ptr_N; + const scalar_t *inp_ptr_NC = inp_ptr_N; scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { auto res = static_cast(0); @@ -652,9 +652,9 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid, int64_t ix_nearest = static_cast(std::nearbyint(ix)); int64_t iy_nearest = static_cast(std::nearbyint(iy)); - // assign nearest neighor pixel value to output pixel + // assign nearest neighbour pixel value to output pixel scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; - scalar_t *inp_ptr_NC = inp_ptr_N; + const scalar_t *inp_ptr_NC = inp_ptr_N; for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { if (within_bounds_2d(iy_nearest, ix_nearest, inp_H, inp_W)) { *out_ptr_NCHW = inp_ptr_NC[iy_nearest * inp_sH + ix_nearest * inp_sW]; @@ -676,13 +676,13 @@ Tensor _grid_sampler_2d_cpu_fallback(const Tensor& input, const Tensor& grid, const scalar_t tx = ix 
- ix_nw; const scalar_t ty = iy - iy_nw; - scalar_t *inp_ptr_NC = inp_ptr_N; + const scalar_t *inp_ptr_NC = inp_ptr_N; scalar_t *out_ptr_NCHW = out_ptr + n * out_sN + h * out_sH + w * out_sW; for (int64_t c = 0; c < C; ++c, out_ptr_NCHW += out_sC, inp_ptr_NC += inp_sC) { // NOLINTNEXTLINE(modernize-avoid-c-arrays,cppcoreguidelines-avoid-c-arrays) scalar_t coefficients[4]; - // Interpolate 4 values in the x directon + // Interpolate 4 values in the x direction for (const auto i : c10::irange(4)) { coefficients[i] = cubic_interp1d( get_value_bounded(inp_ptr_NC, ix_nw - 1, iy_nw - 1 + i, inp_W, inp_H, inp_sW, inp_sH, padding_mode, align_corners), @@ -758,21 +758,21 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output, int64_t gInp_sW = grad_input.stride(3); int64_t gGrid_sN = grad_grid.stride(0); int64_t gGrid_sW = grad_grid.stride(2); - scalar_t *inp_ptr = input.data_ptr(); - scalar_t *grid_ptr = grid.data_ptr(); - scalar_t *gOut_ptr = grad_output.data_ptr(); + const scalar_t *inp_ptr = input.const_data_ptr(); + const scalar_t *grid_ptr = grid.const_data_ptr(); + const scalar_t *gOut_ptr = grad_output.const_data_ptr(); scalar_t *gInp_ptr = grad_input.mutable_data_ptr(); scalar_t *gGrid_ptr = grad_grid.data_ptr(); // loop over each output pixel at::parallel_for(0, N, 0, [&](int64_t start, int64_t end) { for (const auto n : c10::irange(start, end)) { - scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; - scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; + const scalar_t *grid_ptr_N = grid_ptr + n * grid_sN; + const scalar_t *inp_ptr_N = inp_ptr + n * inp_sN; scalar_t *gGrid_ptr_NHW = gGrid_ptr + n * gGrid_sN; for (const auto h : c10::irange(out_H)) { for (int64_t w = 0; w < out_W; ++w, gGrid_ptr_NHW += gGrid_sW /* grad_grid is contiguous */ ) { // get the corresponding input x, y co-ordinates from grid - scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; + const scalar_t *grid_ptr_NHW = grid_ptr_N + h * grid_sH + w * grid_sW; scalar_t x = *grid_ptr_NHW; scalar_t y = grid_ptr_NHW[grid_sCoor]; @@ -804,9 +804,9 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output, scalar_t se = (ix - ix_nw) * (iy - iy_nw); scalar_t gix = static_cast(0), giy = static_cast(0); - scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; + const scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - scalar_t *inp_ptr_NC = inp_ptr_N; + const scalar_t *inp_ptr_NC = inp_ptr_N; // calculate bilinear weighted pixel value and set output pixel for (int64_t c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC += inp_sC) { scalar_t gOut = *gOut_ptr_NCHW; @@ -847,8 +847,8 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output, int64_t ix_nearest = static_cast(std::nearbyint(ix)); int64_t iy_nearest = static_cast(std::nearbyint(iy)); - // assign nearest neighor pixel value to output pixel - scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; + // assign nearest neighbour pixel value to output pixel + const scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; for (int64_t c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC) { // calculate and set grad_input @@ -883,9 +883,9 @@ _grid_sampler_2d_cpu_fallback_backward(const Tensor& grad_output, scalar_t gix = static_cast(0); scalar_t giy = static_cast(0); - scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN 
+ h * gOut_sH + w * gOut_sW; + const scalar_t *gOut_ptr_NCHW = gOut_ptr + n * gOut_sN + h * gOut_sH + w * gOut_sW; scalar_t *gInp_ptr_NC = gInp_ptr + n * gInp_sN; - scalar_t *inp_ptr_NC = inp_ptr_N; + const scalar_t *inp_ptr_NC = inp_ptr_N; for (int64_t c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, gInp_ptr_NC += gInp_sC, inp_ptr_NC+= inp_sC) { scalar_t gOut = *gOut_ptr_NCHW; diff --git a/aten/src/ATen/native/GridSampler.h b/aten/src/ATen/native/GridSampler.h index aaeb7331c3e88..509a305fe4b5e 100644 --- a/aten/src/ATen/native/GridSampler.h +++ b/aten/src/ATen/native/GridSampler.h @@ -211,7 +211,7 @@ static inline bool within_bounds_3d(int64_t d, int64_t h, int64_t w, int64_t D, template static inline scalar_t get_value_bounded( - scalar_t* data, + const scalar_t* data, scalar_t x, scalar_t y, int64_t W, diff --git a/aten/src/ATen/native/LegacyBatching.cpp b/aten/src/ATen/native/LegacyBatching.cpp index cb461932b9a64..8aa08a875f7d9 100644 --- a/aten/src/ATen/native/LegacyBatching.cpp +++ b/aten/src/ATen/native/LegacyBatching.cpp @@ -115,10 +115,7 @@ Tensor _remove_batch_dim(const Tensor& self, int64_t level, int64_t batch_size, const auto* batched = maybeGetBatchedImpl(self); TORCH_INTERNAL_ASSERT(batched != nullptr); - Tensor self_without_bdim; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t newly_exposed_logical_dim; - std::tie(self_without_bdim, newly_exposed_logical_dim) = remove_existing_batch_dim(batched, level); + auto [self_without_bdim, newly_exposed_logical_dim] = remove_existing_batch_dim(batched, level); return maybe_movedim(self_without_bdim, newly_exposed_logical_dim, out_dim); } diff --git a/aten/src/ATen/native/Lerp.cpp b/aten/src/ATen/native/Lerp.cpp index b7e04f0841311..cded246ec7bad 100644 --- a/aten/src/ATen/native/Lerp.cpp +++ b/aten/src/ATen/native/Lerp.cpp @@ -20,9 +20,9 @@ TORCH_META_FUNC(lerp_Tensor)( " for `weight` but got dtype ", weight.dtype()); build(at::TensorIteratorConfig() .add_output(maybe_get_output()) - .add_input(self) - .add_input(end) - .add_input(weight)); + .add_const_input(self) + .add_const_input(end) + .add_const_input(weight)); } TORCH_META_FUNC(lerp_Scalar)( diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index 8dfcff2342069..9322776b03f5a 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -30,6 +30,7 @@ #endif #include +#include #include #include #include @@ -70,6 +71,14 @@ static inline Tensor _flatten_nd_linear(const Tensor& input, const Tensor& weigh Tensor linear(const Tensor& input, const Tensor& weight, const c10::optional& bias_opt) { + // _matmul_impl checks this again later, but _flatten_nd_linear does not work on scalars inputs, + // so let's try to catch this here already + const auto input_dim = input.dim(); + const auto weight_dim = weight.dim(); + TORCH_CHECK(input_dim != 0 && weight_dim != 0, + "both arguments to linear need to be at least 1D, but they are ", + input_dim, "D and ", weight_dim, "D"); + // See [Note: hacky wrapper removal for optional tensor] auto bias = bias_opt.has_value() ? c10::MaybeOwned::borrowed(*bias_opt) @@ -82,7 +91,6 @@ Tensor linear(const Tensor& input, const Tensor& weight, const c10::optionaldefined()) { // Fused op is marginally faster. 
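// Illustrative sketch, not part of the patch above (assumes only the public
// ATen C++ API): the new TORCH_CHECK added to at::linear rejects 0-D
// arguments up front, before _flatten_nd_linear or matmul are ever reached.
#include <ATen/ATen.h>
#include <iostream>

void linear_scalar_guard_demo() {
  at::Tensor input = at::randn({4, 3});
  at::Tensor weight = at::randn({});  // 0-D tensor
  try {
    at::linear(input, weight);
  } catch (const std::exception& e) {
    // Message: "both arguments to linear need to be at least 1D, but they are 2D and 0D"
    std::cout << e.what() << std::endl;
  }
}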
return at::addmm(*bias, input, weight.t()); @@ -703,6 +711,28 @@ Tensor bilinear(const Tensor& input1, const Tensor& input2, const Tensor& weight // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned bias_maybe_owned = at::borrow_from_optional_tensor(bias_opt); const Tensor& bias = *bias_maybe_owned; + if (bias.defined()) { + TORCH_CHECK( + input1.dtype() == input2.dtype() && input1.dtype() == weight.dtype() && + input1.dtype() == bias.dtype(), + "All tensors must have the same dtype, got input1: ", + input1.dtype(), + ", input2: ", + input2.dtype(), + ", weight: ", + weight.dtype(), + ", bias: ", + bias.dtype()); + } else { + TORCH_CHECK( + input1.dtype() == input2.dtype() && input1.dtype() == weight.dtype(), + "All tensors must have the same dtype, got input1: ", + input1.dtype(), + ", input2: ", + input2.dtype(), + ", weight: ", + weight.dtype()); + } TORCH_CHECK(input1.dim() == input2.dim(), "bilinear(): input dimensions do not match: got ", input1.dim(), " and ", input2.dim()); for (const auto i : c10::irange(input1.dim() - 1)) { @@ -785,7 +815,7 @@ Tensor tensordot(const Tensor& input1, const Tensor& input2, IntArrayRef dims1, rsizes.emplace_back(t2.sym_size(i)); } } - // permut and reshape for matrix multiplication + // permute and reshape for matrix multiplication t1 = t1.permute(p1).reshape_symint({size1, csize}); t2 = t2.permute(p2).reshape_symint({csize, size2}); // multiply and reshape to target size diff --git a/aten/src/ATen/native/LinearAlgebra.cpp b/aten/src/ATen/native/LinearAlgebra.cpp index 397b546fbb0f6..81f461f6c95b8 100644 --- a/aten/src/ATen/native/LinearAlgebra.cpp +++ b/aten/src/ATen/native/LinearAlgebra.cpp @@ -12,12 +12,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -28,12 +30,17 @@ #else #include #include +#include +#include #include #include #include #include #include #include +#include +#include +#include #include #include #include @@ -114,6 +121,7 @@ #include #include #include +#include #include #include #include @@ -421,8 +429,7 @@ std::tuple slogdet_out(const Tensor& A, Tensor& sign, Tensor& Tensor logdet(const Tensor& A) { squareCheckInputs(A, "logdet"); checkFloatingOrComplex(A, "logdet", /*low_precision*/false); - Tensor sign, logabsdet; - std::tie(sign, logabsdet) = at::linalg_slogdet(A); + auto [sign, logabsdet] = at::linalg_slogdet(A); if (A.is_complex()) { return sign.log() + logabsdet; @@ -443,7 +450,12 @@ std::tuple get_atol_rtol( const optional& atol_opt, const optional& rtol_opt, const c10::string_view function_name) { - auto options = input.options().dtype(ScalarType::Double); + auto options = input.options(); + if (input.device().type() == kMetal || input.device().type() == kMPS) { + options = options.dtype(ScalarType::Float); + } else { + options = options.dtype(ScalarType::Double); + } auto atol = atol_opt.has_value() ? atol_opt.value() : at::zeros({}, options); checkNotComplexTolerance(atol, function_name, "atol"); Tensor rtol; @@ -464,7 +476,7 @@ std::tuple get_atol_rtol( const Tensor& input, optional atol_opt, optional rtol_opt) { - double atol = atol_opt.has_value() ? atol_opt.value() : 0.0; + auto atol = atol_opt.has_value() ? atol_opt.value() : 0.0; c10::SymFloat rtol; if (rtol_opt.has_value()) { rtol = rtol_opt.value(); @@ -475,7 +487,12 @@ std::tuple get_atol_rtol( ? 
0.0 : default_rtol; } - auto options = input.options().dtype(ScalarType::Double); + auto options = input.options(); + if (input.device().type() == kMetal || input.device().type() == kMPS) { + options = options.dtype(ScalarType::Float); + } else { + options = options.dtype(ScalarType::Double); + } auto atol_tensor = at::full({}, atol, options); auto rtol_tensor = at::full({}, rtol, options); return std::make_tuple(atol_tensor, rtol_tensor); @@ -498,32 +515,28 @@ Tensor linalg_pinv( "linalg.pinv(", t, "{", input.sizes(), "}): expected a tensor with 2 or more dimensions " "of float, double, cfloat or cdouble types"); - Tensor atol, rtol; - std::tie(atol, rtol) = get_atol_rtol(input, atol_opt, rtol_opt, "torch.linalg.pinv"); + auto [atol, rtol] = get_atol_rtol(input, atol_opt, rtol_opt, "torch.linalg.pinv"); if (input.sym_numel() == 0) { // The implementation below uses operations that do not work for zero numel tensors // therefore we need this early return for 'input.numel() == 0' case - Tensor U, S, V; // TODO: replace input.svd with linalg_svd when torch/xla can work with at::linalg_svd - std::tie(U, S, V) = input.svd(); + auto [U, S, V] = input.svd(); return at::matmul(V * S.reciprocal().unsqueeze(-2), U.mH()); } // If not Hermitian use singular value decomposition, else use eigenvalue decomposition if (!hermitian) { - Tensor U, S, V; // TODO: replace input.svd with linalg_svd // using linalg_svd breaks pytorch/xla, see https://github.com/pytorch/xla/issues/2755 - std::tie(U, S, V) = input.svd(); + auto [U, S, V] = input.svd(); Tensor max_val = at::narrow(S, /*dim=*/-1, /*start=*/0, /*length=*/1); // singular values are sorted in descending order Tensor tol = at::max(atol.unsqueeze(-1), rtol.unsqueeze(-1) * max_val); Tensor S_pseudoinv = at::where(S > tol, S.reciprocal(), at::zeros({}, S.options())).to(input.dtype()); // computes V @ diag(S_pseudoinv) @ U.conj().T return at::matmul(V * S_pseudoinv.unsqueeze(-2), U.mH()); } else { - Tensor S, U; - std::tie(S, U) = at::linalg_eigh(input); + auto [S, U] = at::linalg_eigh(input); // For Hermitian matrices, singular values equal to abs(eigenvalues) Tensor S_abs = S.abs(); // eigenvalues are sorted in ascending order starting with negative values, we need a maximum value of abs(eigenvalues) @@ -536,15 +549,19 @@ Tensor linalg_pinv( } Tensor linalg_pinv(const Tensor& input, optional atol, optional rtol, bool hermitian) { - Tensor atol_tensor, rtol_tensor; - std::tie(atol_tensor, rtol_tensor) = get_atol_rtol(input, atol, rtol); + auto [atol_tensor, rtol_tensor] = get_atol_rtol(input, atol, rtol); return at::linalg_pinv(input, atol_tensor, rtol_tensor, hermitian); } Tensor linalg_pinv(const Tensor& input, const Tensor& rcond, bool hermitian) { // For NumPy compatibility the rcond argument is used as relative tolerance checkNotComplexTolerance(rcond, "torch.linalg.pinv", "rcond"); - auto options = input.options().dtype(ScalarType::Double); + auto options = input.options(); + if (input.device().type() == kMetal || input.device().type() == kMPS) { + options = options.dtype(ScalarType::Float); + } else { + options = options.dtype(ScalarType::Double); + } return at::linalg_pinv(input, at::zeros({}, options), rcond, hermitian); } @@ -713,8 +730,7 @@ Tensor& matrix_rank_impl( const optional& rtol_opt, bool hermitian, Tensor& result) { - Tensor atol, rtol; - std::tie(atol, rtol) = get_atol_rtol(input, atol_opt, rtol_opt, "torch.linalg.matrix_rank"); + auto [atol, rtol] = get_atol_rtol(input, atol_opt, rtol_opt, "torch.linalg.matrix_rank"); 
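// Illustrative sketch, not part of the patch above (assumes only the public
// ATen C++ API): the non-Hermitian branch of linalg_pinv above computes
// pinv(A) = V @ diag(1/S) @ U^H, dropping singular values at or below the
// atol/rtol threshold instead of inverting them. A reference version for a
// real-valued A:
#include <ATen/ATen.h>

at::Tensor pinv_via_svd_reference(const at::Tensor& A, double atol) {
  auto [U, S, Vh] = at::linalg_svd(A, /*full_matrices=*/false);
  // Zero out reciprocals of singular values that fall below the tolerance.
  auto S_pinv = at::where(S > atol, S.reciprocal(), at::zeros({}, S.options()));
  return at::matmul(Vh.mH() * S_pinv.unsqueeze(-2), U.mH());
}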
checkSameDevice("torch.linalg.matrix_rank", result, input); checkSameDevice("torch.linalg.matrix_rank", atol, input, "atol"); @@ -788,8 +804,7 @@ Tensor& linalg_matrix_rank_out( } Tensor& linalg_matrix_rank_out(const Tensor& input, optional atol, optional rtol, bool hermitian, Tensor& result) { - Tensor atol_tensor, rtol_tensor; - std::tie(atol_tensor, rtol_tensor) = get_atol_rtol(input, atol, rtol); + auto [atol_tensor, rtol_tensor] = get_atol_rtol(input, atol, rtol); result = linalg_matrix_rank_out(input, atol_tensor, rtol_tensor, hermitian, result); return result; } @@ -802,8 +817,7 @@ Tensor linalg_matrix_rank(const Tensor& input, const optional& atol, con Tensor linalg_matrix_rank(const Tensor& input, optional atol, optional rtol, bool hermitian) { auto result = get_matrix_rank_result_tensor(input); - Tensor atol_tensor, rtol_tensor; - std::tie(atol_tensor, rtol_tensor) = get_atol_rtol(input, atol, rtol); + auto [atol_tensor, rtol_tensor] = get_atol_rtol(input, atol, rtol); return matrix_rank_impl(input, atol_tensor, rtol_tensor, hermitian, result); } @@ -831,8 +845,7 @@ Tensor linalg_matrix_rank(const Tensor& input, const Tensor& tol, bool hermitian Tensor linalg_matrix_rank(const Tensor& input, double tol, bool hermitian) { auto result = get_matrix_rank_result_tensor(input); - Tensor atol_tensor, rtol_tensor; - std::tie(atol_tensor, rtol_tensor) = get_atol_rtol(input, tol, 0.0); + auto [atol_tensor, rtol_tensor] = get_atol_rtol(input, tol, 0.0); return matrix_rank_impl(input, atol_tensor, rtol_tensor, hermitian, result); } @@ -1016,7 +1029,7 @@ Tensor multi_dot_impl(TensorList _tensors, c10::optional _out) { // If the last and last tensors have shapes (a, b) and (b, c) the // output has shape (a, c). If either the first or last tensor is 1D - // a and/or c dimensions will be implicitely size 1 and will be ommited + // a and/or c dimensions will be implicitly size 1 and will be omitted // from the output. e.g. for inputs (a, b) x (b) the output has shape (a,). at::native::resize_output(out, out_shape); @@ -1166,9 +1179,9 @@ static TensorIterator build_addr_iter(Tensor& result, auto iter = TensorIteratorConfig() .set_check_mem_overlap(true) .add_output(result) - .add_owned_input(*self_) - .add_owned_input(vec1.reshape({vec1_size0, 1})) - .add_input(vec2) + .add_owned_const_input(*self_) + .add_owned_const_input(vec1.reshape({vec1_size0, 1})) + .add_const_input(vec2) .allow_cpu_scalars(true) .promote_inputs_to_common_dtype(true) .cast_common_dtype_to_outputs(true) @@ -1323,15 +1336,22 @@ Tensor outer(const Tensor& self, const Tensor& vec2) { #if !defined(C10_MOBILE) -#define _AT_DISPATCH_ADDMM_TYPES(TYPE, NAME, ...) \ - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \ - kBFloat16, kHalf, kFloat8_e5m2, kFloat8_e4m3fn, \ +#define _AT_DISPATCH_ADDMM_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND6( \ + kBFloat16, kHalf, kFloat8_e5m2, kFloat8_e4m3fn, kFloat8_e5m2fnuz, kFloat8_e4m3fnuz, \ + TYPE, NAME, __VA_ARGS__) +#else +// Include half dtype in ADDMM. Used to build ExecuTorch in xplat. +#if defined(C10_MOBILE_HALF) +#define _AT_DISPATCH_ADDMM_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND2(kBFloat16, kHalf, \ TYPE, NAME, __VA_ARGS__) #else #define _AT_DISPATCH_ADDMM_TYPES(TYPE, NAME, ...) 
\ AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND(kBFloat16, \ TYPE, NAME, __VA_ARGS__) #endif +#endif static inline int64_t get_mkldnn_matmul_min_dim() { @@ -1635,8 +1655,8 @@ inline void baddbmm_cpu_kernel(const Tensor& result, const Tensor& self, const T opmath_t beta = beta_.to(); auto r0 = result.accessor(); - auto s0 = self.accessor(); - auto m0 = mat2.accessor(); + auto s0 = self.accessor(); + auto m0 = mat2.accessor(); int64_t grain_size = std::max(internal::GRAIN_SIZE / (is * js * ks), (int64_t)1); using opmath_t = at::opmath_type; @@ -1705,8 +1725,8 @@ static void baddbmm_with_gemm_(const Tensor &result, const Tensor &mat1, const T transpose_a ? TransposeType::Transpose : TransposeType::NoTranspose, transpose_b ? TransposeType::Transpose : TransposeType::NoTranspose, batch_size, m, n, k, alpha, - mat2.data_ptr(), lda, mat2_strides[0], - mat1.data_ptr(), ldb, mat1_strides[0], + mat2.const_data_ptr(), lda, mat2_strides[0], + mat1.const_data_ptr(), ldb, mat1_strides[0], beta, result.data_ptr(), ldc, result_strides[0]); }); @@ -1756,7 +1776,7 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens }; bool apply_heur = apply_mkldnn_matmul_heur(batch1.sizes()[1], batch1.sizes()[2], batch2.sizes()[2]); - if (apply_heur && use_mkldnn_lower_precision_matmul(batch1, batch2, self_or_result)) { + if (apply_heur && use_mkldnn_matmul(batch1, batch2, self_or_result)) { try { mkldnn_matmul(batch1, batch2, self_or_result, beta.to(), alpha.to()); return; @@ -1796,7 +1816,7 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens * vs. other threads, leading to undefined behavior. * Thus it is recommended to not use at::parallel_for where lambdas do * ops that go through dispatcher. - * For now we circument this by InferenceMode guard in order to unlock + * For now we circumvent this by InferenceMode guard in order to unlock * performance. * Longer term we probably want a separate API that explicitly calls out * the TLS that it propagates. @@ -1822,6 +1842,8 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens r, r, batch1.select(0, b), batch2.select(0, b), 0, 1); } }; + // Materialize if COW, since we cannot do so during parallel_for + self_or_result.mutable_data_ptr(); at::parallel_for(0, bs, 1, bmm_out_fn); } else { for (const auto b : c10::irange(bs)) { @@ -1838,6 +1860,8 @@ static inline void bmm_out_or_baddbmm_(const Tensor& self_or_result_, const Tens batch1.select(0, b), batch2.select(0, b), beta, alpha); } }; + // Materialize if COW, since we cannot do so during parallel_for + self_or_result.mutable_data_ptr(); at::parallel_for(0, bs, 1, bmm_fn); } else { for (const auto b : c10::irange(bs)) { @@ -1907,7 +1931,7 @@ Tensor& vdot_out(const Tensor& self, const Tensor& other, Tensor& result) { return result.fill_(self.vdot(other)); } -static bool should_fold(const Tensor& tensor1, const Tensor& tensor2) { +static bool should_fold(const Tensor& tensor1, const Tensor& tensor2, bool has_out) { // We check that we can fold the larger tensor into a matrix and dispatch to mm or mv rather than // to bmm. We want to make sure we can do so without incurring in any extra copy const auto tensor1_larger = tensor1.dim() >= tensor2.dim(); @@ -1933,10 +1957,13 @@ static bool should_fold(const Tensor& tensor1, const Tensor& tensor2) { // The output gradient g of this operation would have shape [b, m, k] // The backward wrt. 
t2 of bmm would be given by t1.mH @ g, which has shape [b, n, k] // Then, the backward of expand is simply `sum(0)`. As such, we are instantiating a tensor - // of shape [b, n, k] unnacessarily, which may cause a large memory footprint, and in the + // of shape [b, n, k] unnecessarily, which may cause a large memory footprint, and in the // worst case, an OOM bool t2_requires_grad = tensor1_larger ? tensor2.requires_grad() : tensor1.requires_grad(); - if (t2_requires_grad) { + if (t2_requires_grad && !has_out) { + // We should be checking !at::GradMode::is_enabled(), but apparently + // this regresses performance in some cases: + // https://github.com/pytorch/pytorch/issues/118548#issuecomment-1916022394 return true; } @@ -1995,6 +2022,15 @@ static Tensor _matmul_impl( const bool has_out = out.defined(); + if (has_out) { + // Usually we would rely on the out= kernels we decompose into to check this, but + // for matmul there is logic at the composite level that relies on this invariant. + TORCH_CHECK(!(tensor1.requires_grad() || tensor2.requires_grad() || out.requires_grad()) || !at::GradMode::is_enabled(), + "matmul(): functions with out=... arguments don't support automatic differentiation, " + "but one of the arguments requires grad." + ); + } + if (dim_tensor1 == 1 && dim_tensor2 == 1) { return has_out ? at::dot_out(out, tensor1, tensor2) : tensor1.dot(tensor2); } else if (dim_tensor1 == 2 && dim_tensor2 == 1) { @@ -2004,7 +2040,7 @@ static Tensor _matmul_impl( : tensor1.unsqueeze(0).mm(tensor2).squeeze_(0); } else if (dim_tensor1 == 2 && dim_tensor2 == 2) { return has_out ? at::mm_out(out, tensor1, tensor2) : tensor1.mm(tensor2); - } else if (should_fold(tensor1, tensor2)) { + } else if (should_fold(tensor1, tensor2, has_out)) { // dim_tensor1 >=3 && (dim_tensor2 == 1 || dim_tensor2 == 2) || // dim_tensor2 >=3 && (dim_tensor1 == 1 || dim_tensor1 == 2) // and at least one of the following two conditions hold @@ -2553,10 +2589,9 @@ Tensor compute_T18_scale_square( // gives us an opportunity to calculate the matrix multiplication in a batch. // The first thing we need to do is sort tensor `s`, which will be helpful to // do the matrix multiplication by range. - Tensor sorted_s, sorted_s_inds; // With above example, `sorted_s` is [0, 1, 1, 4], we also will need the index // info, so we can use it to compose the result back. - std::tie(sorted_s, sorted_s_inds) = at::sort(s, /*dim=*/0); + auto [sorted_s, sorted_s_inds] = at::sort(s, /*dim=*/0); sorted_s = sorted_s.to(at::kLong); // Then we call `unique_consecutive` and we will use it to split `sorted_s`, // with above example, `split_counts` is [1, 2, 1]. @@ -2575,10 +2610,10 @@ Tensor compute_T18_scale_square( TORCH_INTERNAL_ASSERT(section_values.is_contiguous()); const auto section_numel = section_values.numel() / 2; - auto scs = section_values.data_ptr(); + auto scs = section_values. template data_ptr(); auto pts = &scs[section_numel]; - // We now will do the matrix muplication in a batch, with above example: + // We now will do the matrix multiplication in a batch, with above example: // 1. Multiply all matrices by 0 (`mul_times[0]`) times, then do `slice` // to get the remain matrices by acc[1:] (`split_counts[0]`), // 2. Multiply remain matrices by 1 times and slice to acc[2:] @@ -2737,7 +2772,7 @@ Tensor backward_analytic_function_of_a_matrix( } // end anon namespace // Computes the matrix exponential for a given batch of squared matrices. 
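// Illustrative sketch, not part of the patch above (assumes only the public
// ATen C++ API): compute_T18_scale_square relies on the scaling-and-squaring
// identity exp(A) = (exp(A / 2^s))^(2^s); sorting the per-matrix s values is
// what lets it share one batched matmul per squaring round. A plain,
// unbatched reference of the same identity:
#include <ATen/ATen.h>
#include <cmath>

at::Tensor matrix_exp_scale_square_reference(const at::Tensor& A, int64_t s) {
  // Exponentiate the scaled-down matrix, then square the result s times.
  at::Tensor X = at::linalg_matrix_exp(A / std::pow(2.0, static_cast<double>(s)));
  for (int64_t i = 0; i < s; ++i) {
    X = at::matmul(X, X);
  }
  return X;  // agrees with at::linalg_matrix_exp(A) up to floating-point error
}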
-// The implementaion is based on: +// The implementation is based on: // // Bader, P.; Blanes, S.; Casas, F. // Computing the Matrix Exponential with an Optimized Taylor Polynomial Approximation. @@ -2782,13 +2817,49 @@ TORCH_IMPL_FUNC(linalg_vector_norm_out)(const Tensor& self, const Scalar& scalar // values larger than 10^53 (same for negative numbers), so that's fine. auto ord = scalar_ord.toDouble(); auto dim = opt_dim.value_or(IntArrayRef{}); + auto size = self.sizes(); + auto ndim = self.dim(); + + auto opt_dim_ = dim.vec(); + maybe_wrap_dims(opt_dim_, ndim); + + using Int = IntArrayRef::value_type; + std::vector all_dim(ndim); + std::iota(all_dim.begin(), all_dim.end(), 0); + + bool is_all_reduce = !opt_dim.has_value() || opt_dim.value().empty(); + auto reduce_dim = is_all_reduce ? all_dim : opt_dim_; + + bool is_reduce_over_1D_vector = true; + for (auto i : reduce_dim) { + if (size[i] != 1){ + is_reduce_over_1D_vector = false; + break; + } + } + + if (is_reduce_over_1D_vector) { + Tensor self_; + if (opt_dtype.has_value()) { + self_ = self.to(*opt_dtype); + } else { + self_ = self; + } + if (ord != 0.0) { + keepdim ? at::abs_outf(self_, const_cast(result)) : at::abs_outf(self_.squeeze(reduce_dim), const_cast(result)); + } else { + keepdim ? at::ne_outf(self_, 0, const_cast(result)) : at::ne_outf(self_.squeeze(reduce_dim), 0, const_cast(result)); + } + return; + } + // No need to handle opt_dtype explicitly as it is already encoded in the dtype of result // https://github.com/pytorch/pytorch/issues/52648 // Reductions always use `std::abs` to compute the absolute value. In the backward of this // function, we need to locate the index that was selected as the largest value. To do so // we do self.abs() == result to locate the index of the largest element. - // Now, self.abs() may dispatch to a vectorized implementation which gives sliiightly different + // Now, self.abs() may dispatch to a vectorized implementation which gives slightly different // results to the std::abs(std::complex) implementation. // As such, to be able to compute the correct index in the backward, we need to use self.abs() // both in the forward and in the backward @@ -3360,5 +3431,178 @@ Tensor kron(const Tensor& self, const Tensor& other) { return KronImpl(self, other).kron(); } +// Weight Only Quantization Gemm +DEFINE_DISPATCH(weight_to_int4pack_stub); +DEFINE_DISPATCH(int4pack_mm_stub); +DEFINE_DISPATCH(int8pack_mm_stub); + +Tensor _convert_weight_to_int4pack_cpu( + const Tensor& in, + int64_t innerKTiles) { + + TORCH_CHECK(in.dim() == 2, + __func__, " : expect weight to be 2D tensor."); + TORCH_CHECK(in.dtype() == at::kInt, + __func__, " : expect weight to be kInt."); + TORCH_CHECK(innerKTiles == 2 || innerKTiles == 4 || innerKTiles == 8, + __func__, " : innerKTiles need to be 2, 4, or 8, got ", innerKTiles); + + auto weight = in.contiguous(); + auto N = weight.size(0); + auto K = weight.size(1); + + // Create fake shapes for cpu. The meta registration in dynamo requires + // operator has the same output shape for each device. 
So creating a fake + shape {N / 8, K / (16 * innerKTiles), 32, innerKTiles / 2} + constexpr int64_t kNTileSize = 8; + constexpr int64_t kKTileSize = 16; + auto nTiles = (N + kNTileSize - 1) / kNTileSize; + + TORCH_CHECK(N % 16 == 0, + __func__, " : expect N to be divisible by 16"); + const int64_t kSuperKTileSize = kKTileSize * innerKTiles; + TORCH_CHECK( K % kSuperKTileSize == 0, + __func__, " : expect K to be divisible by ", kSuperKTileSize); + auto kSuperTiles = (K + kSuperKTileSize - 1) / kSuperKTileSize; + + auto weight_packed = at::empty( + {nTiles, kSuperTiles, 32, innerKTiles / 2}, + at::TensorOptions().dtype(at::kInt)); + + weight_to_int4pack_stub(kCPU, weight_packed, weight, N, K); + return weight_packed; +} + +Tensor _weight_int4pack_mm_cpu( + const Tensor& A, + const Tensor& B, + int64_t qGroupSize, + const Tensor& qScaleAndZeros) { + + constexpr int64_t kNTileSize = 8; + + auto M = A.size(0); + auto N = B.size(0) * kNTileSize; + auto K = A.size(1); + + TORCH_CHECK(A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat, + __func__, " : expect A to be either 32-bit or 16-bit float tensor."); + TORCH_CHECK(A.is_contiguous(), + __func__, " : expect A to be contiguous."); + TORCH_CHECK(A.dim() == 2, + __func__, " : expect A to be 2D tensor."); + + TORCH_CHECK(B.dtype() == kInt, + __func__, " : expect B to be int32 tensor."); + TORCH_CHECK(B.is_contiguous(), + __func__, " : expect B to be contiguous."); + TORCH_CHECK(B.dim() == 4, + __func__, " : expect B to be a 4d tensor."); + + TORCH_CHECK(qGroupSize == 32 || qGroupSize == 64 || qGroupSize == 128 + || qGroupSize == 256, + __func__, ": expect qGroupSize to be 32, 64, 128 or 256, got ", qGroupSize); + + TORCH_CHECK(qScaleAndZeros.dim() == 3 && qScaleAndZeros.size(1) == N + && qScaleAndZeros.size(2) == 2, + __func__, ": expect qScaleAndZeros to be 3d tensor with sizes [:, ", N, ", 2]"); + + auto C = at::empty({M, N}, A.options()); + int4pack_mm_stub(kCPU, C, A, B, qGroupSize, qScaleAndZeros, N, K); + + return C; +} + +Tensor _weight_int8pack_mm_cpu( + const Tensor& A, + const Tensor& B, + const Tensor& scales) { + + auto M = A.size(0); + auto N = B.size(0); + auto K = A.size(1); + + TORCH_CHECK(A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat, + __func__, " : expect A to be either 32-bit or 16-bit float tensor."); + TORCH_CHECK(A.is_contiguous(), + __func__, " : expect A to be contiguous."); + TORCH_CHECK(A.dim() == 2, + __func__, " : expect A to be 2D tensor."); + + TORCH_CHECK(B.dtype() == kChar, + __func__, " : expect B to be int8 tensor."); + TORCH_CHECK(B.is_contiguous(), + __func__, " : expect B to be contiguous."); + TORCH_CHECK(B.size(1) == K, + __func__, " : expect B.size(1) == ", K); + + TORCH_CHECK(scales.dim() == 1 && scales.size(0) == N, + __func__, " : expect scales to be 1d tensor with size ", N); + + auto C = at::empty({M, N}, A.options()); + int8pack_mm_stub(kCPU, C, A, B, scales); + + return C; +} + +Tensor& _int_mm_out_cpu(const Tensor& self, const Tensor& mat2, Tensor& result) { + static constexpr c10::string_view func_name = "int_mm_out_cpu"; + TORCH_CHECK(self.dim() == 2, func_name, ": Expected self to be of dimension 2 but got ", self.dim()); + TORCH_CHECK(mat2.dim() == 2, func_name, ": Expected mat2 to be of dimension 2 but got ", mat2.dim()); + TORCH_CHECK(self.size(1) == mat2.size(0), func_name, ": self.size(1) needs to match mat2.size(0) but got ", self.size(1), " and ", mat2.size(0)); + TORCH_CHECK(self.dtype() == at::kChar, func_name, ": Expected self dtype to be of type 
int8 but got ", self.dtype()); + TORCH_CHECK(mat2.dtype() == at::kChar, func_name, ": Expected mat2 dtype to be of type int8 but got ", mat2.dtype()); + TORCH_CHECK(result.dtype() == at::kInt, func_name, ": Expected result dtype to be of type kInt but got ", result.dtype()); + TORCH_CHECK(result.size(0) == self.size(0), func_name, ": Expected result.size(0) to be ", self.size(0), " but got ", result.size(0)); + TORCH_CHECK(result.size(1) == mat2.size(1), func_name, ": Expected result.size(1) to be ", mat2.size(1), " but got ", result.size(1)); + TORCH_CHECK(result.dim() == 2, func_name, ": Expected result to be of dimension 2 but got ", result.dim()); + TORCH_CHECK(result.is_contiguous(), func_name, ": Expected result to be contiguous."); + + if (result.numel() == 0 || self.size(1) == 0) { + return result.zero_(); + } + + bool dispatched = false; + if (at::globalContext().userEnabledMkldnn()) { + try { + mkldnn_matmul_i8i8i32(self, mat2, result); + dispatched = true; + } catch (const std::exception& e) { + TORCH_WARN(func_name, " failed, switching to BLAS gemm: ", e.what()); + } + } + if (!dispatched) { + auto a = reinterpret_cast(self.data_ptr()); + auto b = reinterpret_cast(mat2.data_ptr()); + auto c = reinterpret_cast(result.data_ptr()); + const int64_t m = result.size(0); + const int64_t n = result.size(1); + const int64_t k = self.size(1); + const int64_t lda_0 = self.strides()[0]; + const int64_t lda_1 = self.strides()[1]; + const int64_t ldb_0 = mat2.strides()[0]; + const int64_t ldb_1 = mat2.strides()[1]; + const int64_t ldc = result.strides()[0]; + parallel_for(0, m * n, 1, [&](int64_t start, int64_t end) { + for (const auto i : c10::irange(start, end)) { + auto row = i / n; + auto col = i % n; + c[row * ldc + col] = 0; + for (const auto k : c10::irange(k)) { + c[row * ldc + col] = c[row * ldc + col] + + static_cast(a[row * lda_0 + k * lda_1]) * + static_cast(b[k * ldb_0 + col * ldb_1]); + } + } + }); + } + return result; +} + +Tensor _int_mm_cpu(const Tensor& self, const Tensor& mat2) { + Tensor result = at::empty({self.size(0), mat2.size(1)}, self.options().dtype(at::kInt)); + return _int_mm_out_cpu(self, mat2, result); +} + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/LinearAlgebraUtils.h b/aten/src/ATen/native/LinearAlgebraUtils.h index 141caa5236825..0b05d5162e668 100644 --- a/aten/src/ATen/native/LinearAlgebraUtils.h +++ b/aten/src/ATen/native/LinearAlgebraUtils.h @@ -331,8 +331,7 @@ static inline std::tuple _linalg_broadcast_batch_dims(const Tenso linearSolveCheckInputs(arg1, arg2, name); } - std::vector arg1_expand_size, arg2_expand_size; - std::tie(arg1_expand_size, arg2_expand_size) = at::native::_linalg_broadcast_batch_dims(arg1, arg2); + auto [arg1_expand_size, arg2_expand_size] = at::native::_linalg_broadcast_batch_dims(arg1, arg2); auto arg1_broadcasted = arg1_expand_size == arg1.sizes() ? arg1 : arg1.expand(arg1_expand_size); auto arg2_broadcasted = arg2_expand_size == arg2.sizes() ? 
arg2 : arg2.expand(arg2_expand_size); diff --git a/aten/src/ATen/native/Loss.cpp b/aten/src/ATen/native/Loss.cpp index 0eafdf27648d2..e21d9f6008e8e 100644 --- a/aten/src/ATen/native/Loss.cpp +++ b/aten/src/ATen/native/Loss.cpp @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -268,31 +269,34 @@ Tensor& binary_cross_entropy_out_cpu(const Tensor& input, const Tensor& target, auto iter = TensorIteratorConfig() .add_output(loss_squeezed) - .add_owned_input(at::squeeze(input)) - .add_owned_input(at::squeeze(target)) + .add_owned_const_input(at::squeeze(input)) + .add_owned_const_input(at::squeeze(target)) .build(); - AT_DISPATCH_FLOATING_TYPES(loss.scalar_type(), "binary_cross_entropy", [&] { - at::native::cpu_kernel( - iter, - [] (scalar_t input_val, scalar_t target_val) { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + loss.scalar_type(), + "binary_cross_entropy", + [&] { + at::native::cpu_kernel( + iter, [](scalar_t input_val, scalar_t target_val) { TORCH_CHECK( (input_val >= 0) && (input_val <= 1), - "all elements of input should be between 0 and 1" - ); + "all elements of input should be between 0 and 1"); TORCH_CHECK( (target_val >= 0) && (target_val <= 1), - "all elements of target should be between 0 and 1" - ); + "all elements of target should be between 0 and 1"); // Binary cross entropy tensor is defined by the equation: // L = -w (y ln(x) + (1-y) ln(1-x)) - return (target_val - scalar_t(1)) - * std::max(scalar_t(std::log1p(-input_val)), scalar_t(-100)) - - target_val * std::max(scalar_t(std::log(input_val)), scalar_t(-100)); - } - ); - }); + return (target_val - scalar_t(1)) * + std::max(scalar_t(std::log1p(-input_val)), scalar_t(-100)) - + target_val * + std::max(scalar_t(std::log(input_val)), scalar_t(-100)); + }); + }); + if (weight.defined()) { loss.mul_(weight); } @@ -322,26 +326,30 @@ Tensor& binary_cross_entropy_backward_out_cpu(const Tensor& grad, const Tensor& auto iter = TensorIteratorConfig() .add_output(grad_input_squeezed) - .add_owned_input(at::squeeze(grad)) - .add_owned_input(at::squeeze(input)) - .add_owned_input(at::squeeze(target)) + .add_owned_const_input(at::squeeze(grad)) + .add_owned_const_input(at::squeeze(input)) + .add_owned_const_input(at::squeeze(target)) .build(); - AT_DISPATCH_FLOATING_TYPES(grad_input.scalar_type(), "binary_cross_entropy_backward", [&] { - at::native::cpu_kernel( - iter, - [] (scalar_t grad_val, scalar_t input_val, scalar_t target_val) { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::Half, + ScalarType::BFloat16, + grad_input.scalar_type(), + "binary_cross_entropy_backward", + [&] { + at::native::cpu_kernel( + iter, + [](scalar_t grad_val, scalar_t input_val, scalar_t target_val) { // The gradient is the partial derivative of BCELoss // with respect to x // d(L)/d(x) = -w (y - x) / (x - x^2) - return grad_val * (input_val - target_val) - / (scalar_t(std::max( + return grad_val * (input_val - target_val) / + (scalar_t(std::max( (scalar_t(1) - input_val) * input_val, - scalar_t(EPSILON) - ))); - } - ); - }); + scalar_t(EPSILON)))); + }); + }); + if (weight.defined()) { grad_input.mul_(weight); } @@ -358,21 +366,20 @@ Tensor binary_cross_entropy_with_logits(const Tensor& input, const Tensor& targe c10::MaybeOwned pos_weight_maybe_owned = at::borrow_from_optional_tensor(pos_weight_opt); const Tensor& pos_weight = *pos_weight_maybe_owned; - Tensor loss; - auto max_val = (-input).clamp_min_(0); - if (pos_weight.defined()) { - // pos_weight need to be broadcasted, thus 
mul(target) is not inplace. - auto log_weight = (pos_weight - 1).mul(target).add_(1); - loss = (1 - target).mul_(input).add_(log_weight.mul_(((-max_val).exp_().add_((-input - max_val).exp_())).log_().add_(max_val))); - } else { - loss = (1 - target).mul_(input).add_(max_val).add_((-max_val).exp_().add_((-input -max_val).exp_()).log_()); - } + auto log_sigmoid_input = at::log_sigmoid(input); + if (pos_weight.defined()) { + // pos_weight need to be broadcasted, thus mul(target) is not inplace. + auto log_weight = (pos_weight - 1).mul(target).add_(1); + log_sigmoid_input.mul_(log_weight); + } - if (weight.defined()) { - loss.mul_(weight); - } + Tensor loss = (1 - target).mul_(input).sub_(log_sigmoid_input); - return apply_loss_reduction(loss, reduction); + if (weight.defined()) { + loss.mul_(weight); + } + + return apply_loss_reduction(loss, reduction); } Tensor poisson_nll_loss(const Tensor& input, const Tensor& target, const bool log_input, const bool full, const double eps, const int64_t reduction) @@ -435,9 +442,9 @@ Tensor& smooth_l1_loss_backward_out(const Tensor& grad_output, const Tensor& inp auto norm = reduction == Reduction::Mean ? 1. / input.numel() : 1.; auto iter = at::TensorIteratorConfig() .add_output(grad_input) - .add_input(input) - .add_input(target) - .add_input(grad_output) + .add_const_input(input) + .add_const_input(target) + .add_const_input(grad_output) .promote_inputs_to_common_dtype(true) .cast_common_dtype_to_outputs(true) .enforce_safe_casting_to_output(true) @@ -480,9 +487,9 @@ Tensor& huber_loss_backward_out(const Tensor& grad_output, const Tensor& input, auto norm = (reduction == Reduction::Mean) ? (1. / input.numel()) : 1.; auto iter = at::TensorIteratorConfig() .add_output(grad_input) - .add_input(input) - .add_input(target) - .add_input(grad_output) + .add_const_input(input) + .add_const_input(target) + .add_const_input(grad_output) .build(); huber_backward_stub(iter.device_type(), iter, norm, delta); return grad_input; @@ -498,9 +505,9 @@ Tensor& mse_loss_backward_out(const Tensor& grad_output, auto norm = reduction == Reduction::Mean ? 2. 
/ input.numel() : 2.; auto iter = at::TensorIteratorConfig() .add_output(grad_input) - .add_input(input) - .add_input(target) - .add_input(grad_output) + .add_const_input(input) + .add_const_input(target) + .add_const_input(grad_output) .build(); mse_backward_stub(iter.device_type(), iter, norm); return grad_input; diff --git a/aten/src/ATen/native/LossCTC.cpp b/aten/src/ATen/native/LossCTC.cpp index b6ad40b344b23..b13ed7e2ce921 100644 --- a/aten/src/ATen/native/LossCTC.cpp +++ b/aten/src/ATen/native/LossCTC.cpp @@ -77,6 +77,9 @@ std::tuple> ctc_loss_allocate_outpu if (targets.dim() == 1) { // concatenated targets int64_t pos = 0; for (const auto i : c10::irange(batch_size)) { + TORCH_CHECK(target_lengths[i] >= 0, + "Expected target_lengths to have value at least ", 0, ", but got value ", target_lengths[i], + " (while checking arguments for ", c, ")"); tg_batch_offsets[i] = pos; pos += target_lengths[i]; if (max_target_length < target_lengths[i]) @@ -89,6 +92,9 @@ std::tuple> ctc_loss_allocate_outpu // dim is 2 int64_t tg_batch_stride = targets.stride(0); for (const auto i : c10::irange(batch_size)) { + TORCH_CHECK(target_lengths[i] >= 0, + "Expected target_lengths to have value at least ", 0, ", but got value ", target_lengths[i], + " (while checking arguments for ", c, ")"); tg_batch_offsets[i] = i * tg_batch_stride; if (max_target_length < target_lengths[i]) max_target_length = target_lengths[i]; @@ -101,6 +107,9 @@ std::tuple> ctc_loss_allocate_outpu } int64_t max_input_length = log_probs.size(0); for (const auto b : c10::irange(batch_size)) { + TORCH_CHECK(input_lengths[b] >= 0, + "Expected input_lengths to have value at least ", 0, ", but got value ", input_lengths[b], + " (while checking arguments for ", c, ")"); TORCH_CHECK(input_lengths[b] <= max_input_length, "Expected input_lengths to have value at most ", max_input_length, ", but got value ", input_lengths[b], " (while checking arguments for ", c, ")"); @@ -139,9 +148,9 @@ std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const int64_t batch_size = log_probs.size(1); auto lpp = log_probs.permute({1,0,2}); - auto log_probs_a_global = lpp.accessor(); + auto log_probs_a_global = lpp.accessor(); auto log_alpha_a_global = log_alpha.accessor(); - auto targets_data = targets.data_ptr(); + auto targets_data = targets.const_data_ptr(); auto neg_log_likelihood_a = neg_log_likelihood.accessor(); // alpha calculation for the first row, the three equations for alpha_1 above eq (6) @@ -155,6 +164,12 @@ std::tuple ctc_loss_cpu_template(const Tensor& log_probs, const auto log_alpha_a = log_alpha_a_global[b]; int64_t tg_batch_offset = tg_batch_offsets[b]; + if (input_length == 0) { + scalar_t log_likelihood = target_length == 0 ? 
0 : neginf; + neg_log_likelihood_a[b] = -log_likelihood; + continue; + } + // the first two items of alpha_t above eq (6) log_alpha_a[0][0] = log_probs_a[0][BLANK]; if (target_length > 0) @@ -254,12 +269,13 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_ Tensor log_beta = at::empty_like(log_alpha, LEGACY_CONTIGUOUS_MEMORY_FORMAT); // could be optimized to use only 2 rows auto lpp = log_probs.permute({1,0,2}); - auto log_probs_a_global = lpp.accessor(); - auto log_alpha_a_global = log_alpha.accessor(); + auto log_probs_a_global = lpp.accessor(); + auto log_alpha_a_global = log_alpha.accessor(); auto log_beta_a_global = log_beta.accessor(); auto gp = grad.permute({1,0,2}); auto grad_a_global = gp.accessor(); - auto targets_data = targets.data_ptr(); + auto targets_data = targets.const_data_ptr(); + auto grad_out_a = grad_out.accessor(); auto create_fill_iterator = [](const Tensor& tensor, IntArrayRef squash_dims) { return TensorIteratorConfig() @@ -366,7 +382,7 @@ Tensor ctc_loss_backward_cpu_template(const Tensor& grad_out, const Tensor& log_ // now we wrap up the calculation by adding in the remaining items of eq (16) // this could be a great target for further vectorization. // grad is the output gradient, nll is the loss. Note that the likelihood -nll is the Z of eq (16) - scalar_t gr = grad_out.accessor()[b]; + scalar_t gr = grad_out_a[b]; for (const auto t : c10::irange(input_length)) { // or go for the full thing? for (const auto c : c10::irange(num_labels)) { scalar_t& res = grad_a[t][c]; @@ -422,8 +438,8 @@ std::tuple ctc_loss_tensor(const Tensor& log_probs, const Tensor Tensor ilc = input_lengths.to(Device(at::kCPU), at::kLong).contiguous(); Tensor tlc = target_lengths.to(Device(at::kCPU), at::kLong).contiguous(); - IntArrayRef il(ilc.data_ptr(), ilc.numel()); - IntArrayRef tl(tlc.data_ptr(), tlc.numel()); + IntArrayRef il(ilc.const_data_ptr(), ilc.numel()); + IntArrayRef tl(tlc.const_data_ptr(), tlc.numel()); return at::_ctc_loss(log_probs, targets, il, tl, BLANK, zero_infinity); } @@ -536,8 +552,8 @@ Tensor ctc_loss(const Tensor& log_probs, const Tensor& targets, const Tensor& in Tensor ilc = input_lengths.to(Device(at::kCPU), at::kLong).contiguous(); Tensor tlc = target_lengths.to(Device(at::kCPU), at::kLong).contiguous(); - IntArrayRef il(ilc.data_ptr(), ilc.numel()); - IntArrayRef tl(tlc.data_ptr(), tlc.numel()); + IntArrayRef il(ilc.const_data_ptr(), ilc.numel()); + IntArrayRef tl(tlc.const_data_ptr(), tlc.numel()); return at::native::ctc_loss(log_probs, targets, il, tl, BLANK, reduction, zero_infinity); } diff --git a/aten/src/ATen/native/LossMultiLabelMargin.cpp b/aten/src/ATen/native/LossMultiLabelMargin.cpp index f87c0755f5e12..58ca609eaed54 100644 --- a/aten/src/ATen/native/LossMultiLabelMargin.cpp +++ b/aten/src/ATen/native/LossMultiLabelMargin.cpp @@ -24,8 +24,8 @@ namespace { template inline scalar_t multilabel_margin_loss_forward_inner_sum_cpu( - scalar_t* input_data, - int64_t* target_data, + const scalar_t* input_data, + const int64_t* target_data, scalar_t* is_target_data, int64_t dim) { using accscalar_t = at::acc_type; @@ -67,8 +67,8 @@ static void multilabel_margin_loss_forward_out_frame( int64_t nframe, int64_t dim) { using accscalar_t = at::acc_type; - scalar_t* input_data = input_contiguous.data_ptr(); - int64_t* target_data = target_contiguous.data_ptr(); + const scalar_t* input_data = input_contiguous.const_data_ptr(); + const int64_t* target_data = target_contiguous.const_data_ptr(); scalar_t* is_target_data = 
is_target.data_ptr(); if (reduction != Reduction::None || output.dim() == 0) { @@ -168,9 +168,9 @@ static void multilabel_margin_loss_backward_out_frame( TORCH_CHECK( is_target_contiguous.max().item() <= 1, is_target_arg, " is out of range"); - scalar_t* input_data = input_contiguous.data_ptr(); - int64_t* target_data = target_contiguous.data_ptr(); - scalar_t* is_target_data = is_target_contiguous.data_ptr(); + const scalar_t* input_data = input_contiguous.const_data_ptr(); + const int64_t* target_data = target_contiguous.const_data_ptr(); + const scalar_t* is_target_data = is_target_contiguous.const_data_ptr(); scalar_t g = static_cast( // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) reduction == Reduction::Mean ? 1. / (nframe * dim) : 1. / dim); @@ -204,13 +204,13 @@ static void multilabel_margin_loss_backward_out_frame( if (reduction != Reduction::None || grad_output.dim() == 0) { assert( reduction != Reduction::None || grad_output.dim() > 0 || nframe == 1); - const auto d = *grad_output.data_ptr(); + const auto d = *grad_output.const_data_ptr(); for (int64_t t = 0; t < nframe * dim; t++) { grad_input_data[t] *= d; } } else { check_dim_size(grad_output, 1, 0, nframe); - auto grad_output_acc = grad_output.accessor(); + auto grad_output_acc = grad_output.accessor(); for (const auto t : c10::irange(nframe)) { for (const auto d : c10::irange(dim)) { grad_input_data[t * dim + d] *= grad_output_acc[t]; diff --git a/aten/src/ATen/native/LossMultiMargin.cpp b/aten/src/ATen/native/LossMultiMargin.cpp index 32495aab10fc6..5b2f5ae1863b7 100644 --- a/aten/src/ATen/native/LossMultiMargin.cpp +++ b/aten/src/ATen/native/LossMultiMargin.cpp @@ -59,11 +59,11 @@ inline int64_t target_index_checked( template static inline void multi_margin_loss_cpu_kernel( Tensor& output, - scalar_t* input_data, - int64_t* target_data, + const scalar_t* input_data, + const int64_t* target_data, const int p, scalar_t margin, - scalar_t* weight_data, + const scalar_t* weight_data, const int64_t nframe, const int64_t dim, const int64_t reduction) { @@ -131,10 +131,10 @@ void multi_margin_loss_out_cpu_template( AT_DISPATCH_FLOATING_TYPES( input.scalar_type(), "multi_margin_loss_cpu_kernel", [&] { - auto input_data = input_contiguous.data_ptr(); - auto target_data = target_contiguous.data_ptr(); + auto input_data = input_contiguous.const_data_ptr(); + auto target_data = target_contiguous.const_data_ptr(); auto weight_data = - weight_contiguous.defined() ? weight_contiguous.data_ptr() : nullptr; + weight_contiguous.defined() ? 
weight_contiguous.const_data_ptr() : nullptr; multi_margin_loss_cpu_kernel( output, input_data, @@ -152,12 +152,12 @@ template static void multi_margin_loss_backward_cpu_kernel( scalar_t* grad_input_data, const Tensor& grad_output, - scalar_t* input_data, - int64_t* target_data, + const scalar_t* input_data, + const int64_t* target_data, int p, scalar_t margin, scalar_t g, - scalar_t* weight_data, + const scalar_t* weight_data, int64_t nframe, int64_t dim, int64_t reduction) { @@ -193,12 +193,12 @@ static void multi_margin_loss_backward_cpu_kernel( assert( reduction != Reduction::None || grad_output.dim() > 0 || nframe == 1); // check 1d scalar fallback-case - const auto d = *grad_output.data_ptr(); + const auto d = *grad_output.const_data_ptr(); for (int64_t t = 0; t < nframe * dim; t++) { grad_input_data[t] *= d; } } else { - auto grad_output_acc = grad_output.accessor(); + auto grad_output_acc = grad_output.accessor(); for (const auto t : c10::irange(nframe)) { for (const auto d : c10::irange(dim)) { grad_input_data[t * dim + d] *= grad_output_acc[t]; @@ -236,10 +236,10 @@ void multi_margin_loss_backward_out_cpu_template( AT_DISPATCH_FLOATING_TYPES( input.scalar_type(), "multi_margin_loss_backward_cpu_kernel", [&] { auto grad_input_data = grad_input.mutable_data_ptr(); - auto input_data = input_contiguous.data_ptr(); - auto target_data = target_contiguous.data_ptr(); + auto input_data = input_contiguous.const_data_ptr(); + auto target_data = target_contiguous.const_data_ptr(); auto weight_data = weight_contiguous.defined() - ? weight_contiguous.data_ptr() + ? weight_contiguous.const_data_ptr() : nullptr; scalar_t g = reduction == Reduction::Mean ? static_cast(1. / (nframe * dim)) diff --git a/aten/src/ATen/native/LossNLL.cpp b/aten/src/ATen/native/LossNLL.cpp index 86d74e8fec012..0e7de9c27252a 100644 --- a/aten/src/ATen/native/LossNLL.cpp +++ b/aten/src/ATen/native/LossNLL.cpp @@ -147,7 +147,11 @@ inline Tensor optional_contiguous(const Tensor& source) { // or nullptr if the tensor is undefined. template inline scalar_t* optional_data(const Tensor& source) { - return source.defined() ? source.data_ptr() : nullptr; + if constexpr (std::is_const::value) { + return source.defined() ? source.const_data_ptr() : nullptr; + } else { + return source.defined() ? source.data_ptr() : nullptr; + } } template @@ -166,14 +170,14 @@ static void nll_loss_out_frame( *total_weight_data = 0; auto weight_contiguous = optional_contiguous(weight); - const scalar_t* weight_data = optional_data(weight_contiguous); + const scalar_t* weight_data = optional_data(weight_contiguous); if (reduction == Reduction::None && n_dims == 2) { const auto batch_size = input.size(0); at::native::resize_output(output, {batch_size}); - auto input_acc = input.accessor(); - auto target_acc = target.accessor(); + auto input_acc = input.accessor(); + auto target_acc = target.accessor(); auto output_acc = output.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { @@ -219,8 +223,8 @@ static void nll_loss_out_frame( auto input_contiguous = input.contiguous(); auto target_contiguous = target.contiguous(); - const scalar_t* input_data = input_contiguous.data_ptr(); - const target_t* target_data = target_contiguous.data_ptr(); + const scalar_t* input_data = input_contiguous.const_data_ptr(); + const target_t* target_data = target_contiguous.const_data_ptr(); const int64_t ndim = input.dim(); const int64_t batch_size = ndim == 1 ? 
1 : input.size(0); @@ -300,8 +304,12 @@ void nll_loss_forward_out_cpu_template( const Tensor& weight, int64_t reduction, int64_t ignore_index) { - AT_DISPATCH_FLOATING_TYPES_AND( - ScalarType::BFloat16, input.scalar_type(), "nll_loss_out_frame", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2( + ScalarType::BFloat16, + ScalarType::Half, + input.scalar_type(), + "nll_loss_out_frame", + [&] { if (target.scalar_type() == kByte) { nll_loss_out_frame( output, @@ -342,15 +350,15 @@ static void nll_loss_backward_out_frame( if (target.dim() == 0) { target_ = target.unsqueeze(0); } - auto target_acc = target_.accessor(); + auto target_acc = target_.accessor(); auto weight_contiguous = optional_contiguous(weight); - const scalar_t* weight_data = optional_data(weight_contiguous); + const scalar_t* weight_data = optional_data(weight_contiguous); if (reduction == Reduction::None && n_dims == 2) { const auto batch_size = input.size(0); auto grad_input_acc = grad_input.accessor(); - auto grad_output_acc = grad_output.accessor(); + auto grad_output_acc = grad_output.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { for (const auto i : c10::irange(start, end)) { auto cur_target = target_acc[i]; @@ -365,9 +373,9 @@ static void nll_loss_backward_out_frame( return; } - const scalar_t total_weight_value = *total_weight.data_ptr(); + const scalar_t total_weight_value = *total_weight.const_data_ptr(); - const scalar_t grad_output_value = *grad_output.data_ptr(); + const scalar_t grad_output_value = *grad_output.const_data_ptr(); if (input.dim() == 1) { auto grad_input_acc = grad_input.accessor(); @@ -411,8 +419,9 @@ void nll_loss_backward_out_cpu_template( const Tensor& total_weight) { grad_input.zero_(); - AT_DISPATCH_FLOATING_TYPES_AND( + AT_DISPATCH_FLOATING_TYPES_AND2( ScalarType::BFloat16, + ScalarType::Half, input.scalar_type(), "nll_loss_backward_out_frame", [&] { @@ -720,12 +729,12 @@ Tensor nll_loss_nd_symint( input_ = input_.contiguous(); target_ = target_.contiguous(); // support empty batches, see #15870 - if (input_.numel() > 0) { + if (input_.sym_numel() > 0) { input_ = input_.view_symint({n, std::move(c), 1, -1}); } else { input_ = input_.view_symint({n, std::move(c), 0, 0}); } - if (target_.numel() > 0) { + if (target_.sym_numel() > 0) { target_ = target_.view_symint({std::move(n), 1, -1}); } else { target_ = target_.view_symint({std::move(n), 0, 0}); diff --git a/aten/src/ATen/native/LossNLL2d.cpp b/aten/src/ATen/native/LossNLL2d.cpp index 2d210901efc39..94c667dcb1b2b 100644 --- a/aten/src/ATen/native/LossNLL2d.cpp +++ b/aten/src/ATen/native/LossNLL2d.cpp @@ -35,7 +35,11 @@ inline Tensor optional_contiguous(const Tensor& source) { // or nullptr if the tensor is undefined. template inline scalar_t* optional_data(const Tensor& source) { - return source.defined() ? source.data_ptr() : nullptr; + if constexpr (std::is_const::value) { + return source.defined() ? source.const_data_ptr() : nullptr; + } else { + return source.defined() ? 
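Note: the dispatch changes above widen several CPU loss kernels from {float, double, BFloat16} to also cover Half by switching to the AT_DISPATCH_FLOATING_TYPES_AND2 variant, which takes two extra scalar types. A small sketch of how that macro is used (the function and kernel body are hypothetical):

#include <ATen/ATen.h>
#include <ATen/Dispatch.h>

void scale_inplace(at::Tensor& self, double factor) {
  TORCH_CHECK(self.is_contiguous(), "expected a contiguous tensor");
  AT_DISPATCH_FLOATING_TYPES_AND2(
      at::ScalarType::BFloat16,
      at::ScalarType::Half,
      self.scalar_type(),
      "scale_inplace",
      [&] {
        // scalar_t is defined by the dispatch macro for each covered dtype
        scalar_t* data = self.mutable_data_ptr<scalar_t>();
        for (int64_t i = 0; i < self.numel(); ++i) {
          data[i] = static_cast<scalar_t>(static_cast<double>(data[i]) * factor);
        }
      });
}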
source.data_ptr() : nullptr; + } } inline void check_inputs_nll_loss2d( @@ -109,7 +113,7 @@ static void nll_loss2d_forward_out_frame( *total_weight_data = 0; auto weight_contiguous = optional_contiguous(weight); - const scalar_t* weight_data = optional_data(weight_contiguous); + const scalar_t* weight_data = optional_data(weight_contiguous); if (reduction == Reduction::None) { const int64_t batch_size = input.size(0); @@ -117,9 +121,9 @@ static void nll_loss2d_forward_out_frame( const int64_t W = input.size(3); at::native::resize_output(output, {batch_size, H, W}); - auto input_acc = input.accessor(); + auto input_acc = input.accessor(); auto output_acc = output.accessor(); - auto target_acc = target.accessor(); + auto target_acc = target.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { for (const auto b : c10::irange(start, end)) { @@ -170,8 +174,8 @@ static void nll_loss2d_forward_out_frame( auto input_contiguous = input.contiguous(); auto target_contiguous = target.contiguous(); - const scalar_t* input_data = input_contiguous.data_ptr(); - const int64_t* target_data = target_contiguous.data_ptr(); + const scalar_t* input_data = input_contiguous.const_data_ptr(); + const int64_t* target_data = target_contiguous.const_data_ptr(); const int64_t batch_size = input.size(0); const int64_t map_size = input.size(2) * input.size(3); @@ -258,8 +262,9 @@ void nll_loss2d_forward_out_cpu_template( check_inputs_nll_loss2d(input, target, weight); total_weight.resize_({}); - AT_DISPATCH_FLOATING_TYPES_AND( + AT_DISPATCH_FLOATING_TYPES_AND2( ScalarType::BFloat16, + ScalarType::Half, input.scalar_type(), "nll_loss2d_forward_out_frame", [&] { @@ -285,7 +290,7 @@ static void nll_loss2d_backward_out_frame( int64_t ignore_index, const Tensor& total_weight) { auto weight_contiguous = optional_contiguous(weight); - const scalar_t* weight_data = optional_data(weight_contiguous); + const scalar_t* weight_data = optional_data(weight_contiguous); if (reduction == at::Reduction::None) { check_gradout_shape_nll_loss2d(grad_output, target); @@ -295,8 +300,8 @@ static void nll_loss2d_backward_out_frame( const int64_t W = input.size(3); auto grad_input_acc = grad_input.accessor(); - auto grad_output_acc = grad_output.accessor(); - auto target_acc = target.accessor(); + auto grad_output_acc = grad_output.accessor(); + auto target_acc = target.accessor(); at::parallel_for(0, batch_size, 0, [&](int64_t start, int64_t end) { for (const auto b : c10::irange(start, end)) { @@ -319,17 +324,17 @@ static void nll_loss2d_backward_out_frame( return; } - const scalar_t total_weight_value = *total_weight.data_ptr(); + const scalar_t total_weight_value = *total_weight.const_data_ptr(); TORCH_CHECK( grad_output.dim() <= 1 && grad_output.numel() == 1, "Expected a single element grad_output tensor, but got: ", grad_output.sizes()); - const scalar_t grad_output_value = *grad_output.data_ptr(); + const scalar_t grad_output_value = *grad_output.const_data_ptr(); const auto target_contiguous = target.contiguous(); - const int64_t* target_data = target_contiguous.data_ptr(); + const int64_t* target_data = target_contiguous.const_data_ptr(); scalar_t* grad_input_data = grad_input.mutable_data_ptr(); @@ -379,8 +384,9 @@ void nll_loss2d_backward_out_cpu_template( total_weight.numel(), " elements)"); - AT_DISPATCH_FLOATING_TYPES_AND( + AT_DISPATCH_FLOATING_TYPES_AND2( ScalarType::BFloat16, + ScalarType::Half, input.scalar_type(), "nll_loss2d_backward_out_frame", [&] { diff --git a/aten/src/ATen/native/Math.h 
b/aten/src/ATen/native/Math.h index f944a518ed63f..092ee00992e9d 100644 --- a/aten/src/ATen/native/Math.h +++ b/aten/src/ATen/native/Math.h @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp index de0806923a32f..fbac5d4cc72c2 100644 --- a/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp +++ b/aten/src/ATen/native/NaiveConvolutionTranspose2d.cpp @@ -298,13 +298,16 @@ void slow_conv_transpose2d_out_cpu_template( } columns.zero_(); + // Materialize if COW, since we cannot do so during parallel_for + output.mutable_data_ptr(); + AT_DISPATCH_FLOATING_TYPES_AND3(at::ScalarType::Long, at::ScalarType::BFloat16, at::ScalarType::Half, input.scalar_type(), "slow_conv_transpose2d_out_cpu", [&] { at::parallel_for(0, batch_size, 0, [&](int64_t begin, int64_t end) { // For each elt in batch, do: for (const auto elt : c10::irange(begin, end)) { - // Matrix mulitply per output: + // Matrix multiply per output: Tensor input_n = input_.select(0, elt); Tensor output_n = output.select(0, elt); Tensor columns_n = columns.select(0, elt); @@ -353,7 +356,7 @@ void slow_conv_transpose2d_out_cpu_template( // Unpack columns back into input: col2im( - columns_n.data_ptr(), + columns_n.const_data_ptr(), n_output_plane, output_height, output_width, @@ -501,14 +504,14 @@ static void slow_conv_transpose2d_backward_out_cpu_template( // For each elt in batch, do: for (const auto elt : c10::irange(batch_size)) { - // Matrix mulitply per sample: + // Matrix multiply per sample: grad_input_n = grad_input.select(0, elt); grad_output_n = grad_output.select(0, elt); if (need_columns) { // Extract columns: im2col( - grad_output_n.data_ptr(), + grad_output_n.const_data_ptr(), n_output_plane, output_height, output_width, @@ -526,8 +529,8 @@ static void slow_conv_transpose2d_backward_out_cpu_template( use_channels_last); } - auto gemm_in_ptr = need_columns ? grad_columns.data_ptr() - : grad_output_n.data_ptr(); + auto gemm_in_ptr = need_columns ? grad_columns.const_data_ptr() + : grad_output_n.const_data_ptr(); if (use_channels_last) { int64_t m = n_input_plane; @@ -695,18 +698,18 @@ void slow_conv_transpose2d_acc_grad_parameters_cpu( // For each elt in batch, do: for (const auto elt : c10::irange(batch_size)) { - // Matrix mulitply per output: + // Matrix multiply per output: grad_output_n = grad_output.select(0, elt); // Do Weight: if (grad_weight.defined()) { - // Matrix mulitply per output: + // Matrix multiply per output: input_n = input.select(0, elt); if (need_columns) { // Extract columns: im2col( - grad_output_n.data_ptr(), + grad_output_n.const_data_ptr(), n_output_plane, output_height, output_width, @@ -724,8 +727,8 @@ void slow_conv_transpose2d_acc_grad_parameters_cpu( use_channels_last); } - auto gemm_in_ptr = need_columns ? columns.data_ptr() - : grad_output_n.data_ptr(); + auto gemm_in_ptr = need_columns ? 
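Note: the output.mutable_data_ptr(); call added above ("Materialize if COW") forces any lazy copy-on-write storage to materialize once, on the calling thread, because doing it lazily from inside at::parallel_for would not be safe. The same precaution in a standalone sketch (names are made up):

#include <ATen/ATen.h>
#include <ATen/Parallel.h>

void fill_rows_with_index(at::Tensor& out) {
  TORCH_CHECK(out.dim() >= 1, "expected at least a 1-d tensor");
  // Grab a mutable pointer once, up front, so a copy-on-write storage is
  // materialized before any worker thread touches it.
  out.mutable_data_ptr();
  at::parallel_for(0, out.size(0), /*grain_size=*/0,
                   [&](int64_t begin, int64_t end) {
    for (int64_t i = begin; i < end; ++i) {
      out.select(0, i).fill_(static_cast<double>(i));
    }
  });
}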
columns.const_data_ptr() + : grad_output_n.const_data_ptr(); if (use_channels_last) { int64_t m = kernel_height * kernel_width * n_output_plane; diff --git a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp index a9f02117dd802..624e820c7ba66 100644 --- a/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp +++ b/aten/src/ATen/native/NaiveConvolutionTranspose3d.cpp @@ -329,7 +329,7 @@ void slow_conv_transpose3d_out_cpu_template( // Unpack columns back into input: at::native::col2vol( - columns.data_ptr(), + columns.const_data_ptr(), n_output_plane, output_depth, output_height, @@ -562,8 +562,8 @@ void slow_conv_transpose3d_backward_out_cpu_template( // Do GEMM (note: this is a bit confusing because gemm assumes // column-major matrices) - auto gemm_in_ptr = need_columns ? grad_columns.data_ptr() - : grad_output_n.data_ptr(); + auto gemm_in_ptr = need_columns ? grad_columns.const_data_ptr() + : grad_output_n.const_data_ptr(); cpublas::gemm( TransposeType::NoTranspose, TransposeType::NoTranspose, @@ -782,8 +782,8 @@ void slow_conv_transpose3d_acc_grad_parameters_cpu( // Do GEMM (note: this is a bit confusing because gemm assumes // column-major matrices) - auto gemm_in_ptr = need_columns ? columns.data_ptr() - : grad_output_n.data_ptr(); + auto gemm_in_ptr = need_columns ? columns.const_data_ptr() + : grad_output_n.const_data_ptr(); cpublas::gemm( TransposeType::Transpose, TransposeType::NoTranspose, diff --git a/aten/src/ATen/native/Normalization.cpp b/aten/src/ATen/native/Normalization.cpp index 4bc7dbc139b68..93d2ce11d934f 100644 --- a/aten/src/ATen/native/Normalization.cpp +++ b/aten/src/ATen/native/Normalization.cpp @@ -29,6 +29,11 @@ #include #include #include +#include +#include +#include +#include +#include #include #include #include @@ -199,6 +204,7 @@ std::tuple batch_norm_cpu_update_stats_template( using accscalar_t = at::acc_type; int64_t n_input = input.size(1); + TORCH_CHECK(input.numel() != 0, "input tensor must have at least one element, but got input_sizes = ", input.sizes()); int64_t n = input.numel() / n_input; bool all_contiguous = is_contiguous(input); @@ -332,18 +338,18 @@ std::tuple batch_norm_backward_cpu_template( return std::make_tuple(grad_input, grad_weight, grad_bias); } - auto weight_a = conditional_accessor_1d(weight); + auto weight_a = conditional_accessor_1d(weight); auto grad_weight_a = conditional_accessor_1d(grad_weight); auto grad_bias_a = conditional_accessor_1d(grad_bias); int64_t n_input = input.size(1); int64_t n = input.numel() / n_input; - auto save_mean_a = conditional_accessor_1d(save_mean); - auto save_invstd_a = conditional_accessor_1d(save_invstd); + auto save_mean_a = conditional_accessor_1d(save_mean); + auto save_invstd_a = conditional_accessor_1d(save_invstd); - auto running_mean_a = conditional_accessor_1d(running_mean); - auto running_var_a = conditional_accessor_1d(running_var); + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); const int64_t ndim = input.dim(); @@ -358,8 +364,8 @@ std::tuple batch_norm_backward_cpu_template( auto sum_a = sum.accessor(); auto reduce_iter = TensorIteratorConfig() - .add_input(input) - .add_input(grad_out_) + .add_const_input(input) + .add_const_input(grad_out_) .resize_outputs(false) .declare_static_shape(input.sizes(), /*squash_dims=*/1) .build(); @@ -370,7 +376,7 @@ std::tuple batch_norm_backward_cpu_template( unary_iter.build( TensorIteratorConfig() .add_output(grad_input) 
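Note: the new TORCH_CHECK(input.numel() != 0, ...) in batch_norm_cpu_update_stats_template above makes the CPU stats update fail with a readable message on empty inputs instead of dividing by zero when it computes n = input.numel() / n_input. A sketch of a call that now reports that error (shapes are illustrative):

#include <ATen/ATen.h>

void empty_batch_norm_example() {
  at::Tensor x  = at::empty({0, 3, 4, 4});            // zero-element input
  at::Tensor w  = at::ones({3}),  b  = at::zeros({3});
  at::Tensor rm = at::zeros({3}), rv = at::ones({3});
  try {
    // training=true routes through the CPU stats update for a CPU tensor
    at::batch_norm(x, w, b, rm, rv, /*training=*/true,
                   /*momentum=*/0.1, /*eps=*/1e-5, /*cudnn_enabled=*/true);
  } catch (const c10::Error& e) {
    // "input tensor must have at least one element, but got input_sizes = ..."
  }
}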
- .add_input(train ? input : grad_out_) + .add_const_input(train ? input : grad_out_) .resize_outputs(false) .declare_static_shape(input.sizes(), /*squash_dims=*/1)); @@ -379,18 +385,18 @@ std::tuple batch_norm_backward_cpu_template( TensorIteratorConfig() .add_output(grad_input) .add_input(grad_input) - .add_input(grad_out_) + .add_const_input(grad_out_) .resize_outputs(false) .declare_static_shape(input.sizes(), /*squash_dims=*/1)); } } auto in_channel_stride = input.strides()[1]; - auto in_data = input.data_ptr(); + auto in_data = input.const_data_ptr(); auto grad_in_channel_stride = grad_input_mask[0] ? grad_input.strides()[1] : 0; auto grad_in_data = grad_input_mask[0] ? grad_input.mutable_data_ptr() : nullptr; auto grad_out_channel_stride = grad_out_.strides()[1]; - auto grad_out_data = grad_out_.data_ptr(); + auto grad_out_data = grad_out_.const_data_ptr(); parallel_for(0, n_input, 1, [&](int64_t b_begin, int64_t b_end) { TensorIterator reduce_iter_local(reduce_iter); @@ -409,12 +415,12 @@ std::tuple batch_norm_backward_cpu_template( invstd = 1 / std::sqrt(running_var_a[f] + eps); } - // dot product of the Q(X) and gradOuput + // dot product of the Q(X) and gradOutput accscalar_t dotp = 0; reduce_iter_local.unsafe_replace_operand( - 0, in_data + f * in_channel_stride); + 0, const_cast(in_data + f * in_channel_stride)); reduce_iter_local.unsafe_replace_operand( - 1, grad_out_data + f * grad_out_channel_stride); + 1, const_cast(grad_out_data + f * grad_out_channel_stride)); cpu_serial_kernel(reduce_iter_local, [&](const scalar_t i, const scalar_t go) -> void { dotp += (i - mean) * go; @@ -433,7 +439,7 @@ std::tuple batch_norm_backward_cpu_template( unary_iter_local.unsafe_replace_operand( 0, grad_in_data + f * grad_in_channel_stride); unary_iter_local.unsafe_replace_operand( - 1, in_data + f * in_channel_stride); + 1, const_cast(in_data + f * in_channel_stride)); cpu_serial_kernel(unary_iter_local, [&](const scalar_t i) -> scalar_t { return (i - mean) * k; }); @@ -445,7 +451,7 @@ std::tuple batch_norm_backward_cpu_template( binary_iter_local.unsafe_replace_operand(0, gI_data); binary_iter_local.unsafe_replace_operand(1, gI_data); binary_iter_local.unsafe_replace_operand( - 2, grad_out_data + f * grad_out_channel_stride); + 2, const_cast(grad_out_data + f * grad_out_channel_stride)); cpu_serial_kernel(binary_iter_local, [&](scalar_t gi, scalar_t go) -> scalar_t { return (go - grad_mean - gi) * invstd * w; }); @@ -459,7 +465,7 @@ std::tuple batch_norm_backward_cpu_template( unary_iter_local.unsafe_replace_operand( 0, grad_in_data + f * grad_in_channel_stride); unary_iter_local.unsafe_replace_operand( - 1, grad_out_data + f * grad_out_channel_stride); + 1, const_cast(grad_out_data + f * grad_out_channel_stride)); cpu_serial_kernel(unary_iter_local, [&](const scalar_t i) -> scalar_t { return i * invstd * w; }); @@ -478,10 +484,58 @@ std::tuple batch_norm_backward_cpu_template( return std::make_tuple(grad_input, grad_weight, grad_bias); } +BatchNormBackend _select_batch_norm_backend( + const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& running_mean, + const Tensor& running_var, bool training, double eps) { + + auto& ctx = at::globalContext(); + bool cudnn_enabled = ctx.userEnabledCuDNN(); + + if ( + input.is_cuda() + && input.scalar_type() != at::kBFloat16 && weight.scalar_type() != at::kBFloat16 + && (input.scalar_type() != at::kHalf + || weight.scalar_type() == at::kFloat) + && weight.defined() && bias.defined() + && ((running_mean.defined() && 
running_var.defined()) + || (!running_mean.defined() && !running_var.defined() && training)) + && (input.dim() >= 3) + && ((input.sym_size(0) <= 880801 && training) // spatial, training + ||(input.sym_size(0) <= 65535 && !training)) //spatial, eval + && detail::getCUDAHooks().compiledWithCuDNN() + && eps >= detail::getCUDAHooks().batchnormMinEpsilonCuDNN() + && cudnn_enabled && detail::getCUDAHooks().versionCuDNN() >= 5110L + && input.sym_numel() < std::numeric_limits::max() // some cuDNN kernels have 32-bit indexing limitations + ) { + return BatchNormBackend::Cudnn; + } + + if ( + input.is_cuda() + && input.dim() <= MIOPEN_DIM_MAX + && input.scalar_type() != at::kDouble + && input.scalar_type() != at::kBFloat16 + && (weight.scalar_type() != at::kHalf) + && weight.defined() && bias.defined() + && ((running_mean.defined() && running_var.defined()) + || (!running_mean.defined() && !running_var.defined() && training)) + && detail::getCUDAHooks().compiledWithMIOpen() + && cudnn_enabled + && input.suggest_memory_format() != MemoryFormat::ChannelsLast + && input.suggest_memory_format() != MemoryFormat::ChannelsLast3d + ) { + return BatchNormBackend::Miopen; + } + + return BatchNormBackend::Native; +} + + // _batch_norm_impl_index(_backward) are used in the JIT be able to keep the run-time selection // of backends, while enabling it to keep the information about the used backend, so that it can // use its corresponding backward implementation. // XXX: The indices of backends need to be kept synchronized between this function and its _backward. +// TODO: remove cudnn_enabled arg std::tuple _batch_norm_impl_index( const Tensor& input, const c10::optional& weight_opt /* optional */, const c10::optional& bias_opt /* optional */, const c10::optional& running_mean_opt /* optional */, const c10::optional& running_var_opt /* optional */, bool training, double momentum, double eps, bool cudnn_enabled) { @@ -526,32 +580,16 @@ std::tuple _batch_norm_impl_index( check_dims_match_num_input_features("bias", std::move(num_features), bias.sym_numel()); } - const bool use_cudnn = ( - input.is_cuda() - && input.scalar_type() != at::kBFloat16 && weight.scalar_type() != at::kBFloat16 - && (input.scalar_type() != at::kHalf - || weight.scalar_type() == at::kFloat) - && weight.defined() && bias.defined() - && ((running_mean.defined() && running_var.defined()) - || (!running_mean.defined() && !running_var.defined() && training)) - && (input.dim() >= 3) - && ((input.sym_size(0) <= 880801 && training) // spatial, training - ||(input.sym_size(0) <= 65535 && !training)) //spatial, eval - && detail::getCUDAHooks().compiledWithCuDNN() - && eps >= detail::getCUDAHooks().batchnormMinEpsilonCuDNN() - && cudnn_enabled && detail::getCUDAHooks().versionCuDNN() >= 5110L - && input.sym_numel() < std::numeric_limits::max() // some cuDNN kernels have 32-bit indexing limitations - ); + BatchNormBackend backend = _select_batch_norm_backend(input, weight, bias, running_mean, running_var, training, eps); - if (use_cudnn) { + if (backend == BatchNormBackend::Cudnn) { auto input_c = input.contiguous(input.suggest_memory_format()); auto weight_c = weight.contiguous(); auto bias_c = bias.contiguous(); auto rmean_c = running_mean.defined() ? running_mean.contiguous() : running_mean; auto rvar_c = running_var.defined() ? 
running_var.contiguous() : running_var; - Tensor output, save_mean, save_var, reserve; - std::tie(output, save_mean, save_var, reserve) = + auto [output, save_mean, save_var, reserve] = at::cudnn_batch_norm(input_c, weight_c, bias_c, rmean_c, rvar_c, training, momentum, eps); @@ -561,19 +599,7 @@ std::tuple _batch_norm_impl_index( Tensor reserve = at::empty({0}, input.options().dtype(kByte)); - bool use_miopen = (input.is_cuda() - && input.dim() <= MIOPEN_DIM_MAX - && input.scalar_type() != at::kDouble - && input.scalar_type() != at::kBFloat16 - && (weight.scalar_type() != at::kHalf) - && weight.defined() && bias.defined() - && ((running_mean.defined() && running_var.defined()) - || (!running_mean.defined() && !running_var.defined() && training)) - && detail::getCUDAHooks().compiledWithMIOpen() - && cudnn_enabled - ); - - if (use_miopen && input.suggest_memory_format() != MemoryFormat::ChannelsLast && input.suggest_memory_format() != MemoryFormat::ChannelsLast3d) { + if (backend == BatchNormBackend::Miopen) { return std::tuple_cat( at::miopen_batch_norm( input.contiguous(), weight.contiguous(), bias.contiguous(), @@ -637,6 +663,7 @@ std::tuple _batch_norm_impl_index_backward( TORCH_INTERNAL_ASSERT(false, "Unsupported impl_index in _batch_norm_impl_index_backward: ", impl_index); } +// TODO: remove cudnn_enabled arg Tensor batch_norm( const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, @@ -647,6 +674,30 @@ Tensor batch_norm( const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); return std::get<0>(at::_batch_norm_impl_index(input, weight, bias, running_mean, running_var, training, momentum, eps, cudnn_enabled)); + // TODO: switch to the new stack after the 2 week FC window + // if (training) { + // BatchNormBackend backend = _select_batch_norm_backend(input, weight, bias, running_mean, running_var, training, eps); + // if (backend == BatchNormBackend::Cudnn || backend == BatchNormBackend::Miopen) { + // auto input_c = input; + // if (backend == BatchNormBackend::Cudnn) { + // input_c = input.contiguous(input.suggest_memory_format()); + // } else { + // input_c = input.contiguous(); + // } + // auto weight_c = weight.contiguous(); + // auto bias_c = bias.contiguous(); + // auto rmean_c = running_mean.defined() ? running_mean.contiguous() : running_mean; + // auto rvar_c = running_var.defined() ? 
running_var.contiguous() : running_var; + // return std::get<0>(at::_batch_norm_with_update(input_c, weight_c, bias_c, const_cast(rmean_c), + // const_cast(rvar_c), momentum, eps)); + // } else { + // return std::get<0>(at::_batch_norm_with_update(input, weight, bias, const_cast(running_mean), + // const_cast(running_var), momentum, eps)); + // } + // } else { + // return std::get<0>(at::_batch_norm_no_update(input, weight, bias, running_mean, running_var, + // momentum, eps)); + // } } Tensor instance_norm( @@ -798,6 +849,38 @@ std::tuple batch_norm_cpu(const Tensor& self, const c10: return batch_norm_cpu_out(self, weight_opt, bias_opt, running_mean_opt, running_var_opt, train, momentum, eps, output, save_mean, save_var); } +std::tuple _batch_norm_with_update_cpu( + const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + Tensor& running_mean, Tensor& running_var, double momentum, double eps) { + Tensor output, save_mean, save_var; + std::tie(output, save_mean, save_var) = + batch_norm_cpu(input, weight_opt, bias_opt, running_mean, running_var, /*update*/true, momentum, eps); + Tensor reserve = at::empty({0}, input.options().dtype(kByte)); + return std::tuple(output, save_mean, save_var, reserve); +} + +std::tuple _batch_norm_with_update_cpu_out( + const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + Tensor& running_mean, Tensor& running_var, double momentum, double eps, + Tensor& out, Tensor& save_mean, Tensor& save_var, Tensor& reserve) { + std::tie(out, save_mean, save_var) = + batch_norm_cpu_out(input, weight_opt, bias_opt, running_mean, running_var, /*update*/true, momentum, eps, out, save_mean, save_var); + return std::tuple(out, save_mean, save_var, reserve); +} + + +std::tuple _batch_norm_no_update( + const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + double momentum, double eps) { + const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); + const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); + Tensor output, save_mean, save_var; + std::tie(output, save_mean, save_var) = + batch_norm_cpu(input, weight_opt, bias_opt, const_cast(running_mean), const_cast(running_var), /*update*/false, momentum, eps); + Tensor reserve = at::empty({0}, input.options().dtype(kByte)); + return std::tuple(output, save_mean, save_var, reserve); +} std::tuple _batch_norm_legit_cpu( const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, @@ -826,6 +909,13 @@ std::tuple _batch_norm_legit_no_stats_cpu_out(const T return batch_norm_cpu_out(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, eps, out, save_mean, save_var); } +std::tuple _new_batch_norm_backward_cpu( + const Tensor& grad_output, const Tensor& input, const Tensor& weight, + const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + const c10::optional& save_mean_opt, const c10::optional& save_var_opt, + bool update, double eps, std::array grad_input_mask, const Tensor& reserve) { + return batch_norm_backward_cpu(grad_output, input, weight, running_mean_opt, running_var_opt, save_mean_opt, save_var_opt, update, eps, grad_input_mask); +} std::tuple batch_norm_backward_cpu(const Tensor& grad_out, const Tensor& self, const c10::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& 
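Note: the refactor above hoists the cuDNN/MIOpen eligibility predicates into _select_batch_norm_backend, so _batch_norm_impl_index (and the not-yet-enabled _batch_norm_with_update path) can branch on a single enum instead of duplicating the checks. A sketch of how an in-tree caller might consume it, assuming the internal ATen/native/Normalization.h header is reachable:

#include <ATen/ATen.h>
#include <ATen/native/Normalization.h>

const char* chosen_batch_norm_backend(
    const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias,
    const at::Tensor& running_mean, const at::Tensor& running_var,
    bool training, double eps) {
  using at::native::BatchNormBackend;
  switch (at::native::_select_batch_norm_backend(
      input, weight, bias, running_mean, running_var, training, eps)) {
    case BatchNormBackend::Cudnn:  return "cudnn";
    case BatchNormBackend::Miopen: return "miopen";
    default:                       return "native";
  }
}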
save_mean_opt, const c10::optional& save_invstd_opt, bool train, double eps, std::array grad_input_mask) { diff --git a/aten/src/ATen/native/Normalization.h b/aten/src/ATen/native/Normalization.h index 6cd4dcde37052..1ba99e77b65c8 100644 --- a/aten/src/ATen/native/Normalization.h +++ b/aten/src/ATen/native/Normalization.h @@ -8,4 +8,12 @@ namespace at::native { using renorm_scale_factor_fn = void (*) (TensorIteratorBase& iter, double maxnorm); DECLARE_DISPATCH(renorm_scale_factor_fn, renorm_scale_factor_stub); +enum class BatchNormBackend { + Native, + Cudnn, + Miopen, +}; + +TORCH_API BatchNormBackend _select_batch_norm_backend(const Tensor& input, const Tensor& weight, const Tensor& bias, const Tensor& running_mean, const Tensor& running_var, bool training, double eps); + } // namespace at::native diff --git a/aten/src/ATen/native/Onehot.cpp b/aten/src/ATen/native/Onehot.cpp index 91c3c8f1611e4..97c35599f791c 100644 --- a/aten/src/ATen/native/Onehot.cpp +++ b/aten/src/ATen/native/Onehot.cpp @@ -42,15 +42,17 @@ Tensor one_hot(const Tensor &self, int64_t num_classes) { } // non-empty tensor - if (self.device().type() != at::kCUDA && self.device().type() != at::kMPS) { - //for cuda, rely on device assert thrown by scatter + if (self.device().type() != at::kCUDA && self.device().type() != at::kMPS && + self.device().type() != at::kPrivateUse1) { + // for cuda, rely on device assert thrown by scatter TORCH_CHECK(self.min().item().toLong() >= 0, "Class values must be non-negative."); } if (num_classes == -1) { num_classes = self.max().item().toLong() + 1; } else { - if (self.device().type() != at::kCUDA && self.device().type() != at::kMPS) { - //rely on device asserts from scatter to avoid sync here + if (self.device().type() != at::kCUDA && self.device().type() != at::kMPS && + self.device().type() != at::kPrivateUse1) { + // rely on device asserts from scatter to avoid sync here TORCH_CHECK(num_classes > self.max().item().toLong(), "Class values must be smaller than num_classes."); } else { //for cuda, assert that num_classes is at least 1 diff --git a/aten/src/ATen/native/Pool.h b/aten/src/ATen/native/Pool.h index 33a733273a80a..07940729fda8c 100644 --- a/aten/src/ATen/native/Pool.h +++ b/aten/src/ATen/native/Pool.h @@ -26,6 +26,19 @@ using avg_pool2d_backward_fn = void(*)(const Tensor& output, const Tensor& input DECLARE_DISPATCH(avg_pool2d_fn, avg_pool2d_kernel); DECLARE_DISPATCH(avg_pool2d_backward_fn, avg_pool2d_backward_kernel); +// averge pooling has same signature for forward and backward +using avg_pool3d_fn = void(*)(const Tensor& output, const Tensor& input, + int64_t kW, int64_t kH, int64_t kD, int64_t dW, int64_t dH, int64_t dD, + int64_t padW, int64_t padH, int64_t padD, bool count_include_pad, + c10::optional divisor_override); +using avg_pool3d_backward_fn = void(*)(const Tensor& output, const Tensor& input, + int kW, int kH, int kD, int dW, int dH, int dD, + int padW, int padH, int padD, bool count_include_pad, + c10::optional divisor_override); + +DECLARE_DISPATCH(avg_pool3d_fn, avg_pool3d_kernel); +DECLARE_DISPATCH(avg_pool3d_backward_fn, avg_pool3d_backward_kernel); + using max_pool3d_fn = void(*)(Tensor& output, Tensor& indices, const Tensor& input, int kW, int kH, int kD, int dW, int dH, int dD, int pW, int pH, int pD, int dilationW, int dilationH, int dilationD); using max_pool3d_backward_fn = void(*)(Tensor& grad_input, const Tensor& grad_output, const Tensor& indices); @@ -67,9 +80,9 @@ static inline T pooling_output_shape( TORCH_CHECK(stride != 0, "stride should not 
be zero"); TORCH_CHECK(pad >= 0, "pad must be non-negative, but got pad: ", pad); - TORCH_CHECK(pad <= kernelSize / 2, - "pad should be at most half of kernel size, but got pad=", - pad, " and kernel_size=", kernelSize) + TORCH_CHECK(pad <= ((kernelSize - 1) * dilation + 1) / 2, + "pad should be at most half of effective kernel size, but got pad=", + pad, ", kernel_size=", kernelSize, " and dilation=", dilation) return pooling_output_shape_pad_lr( inputSize, kernelSize, pad, pad, stride, dilation, ceil_mode); } diff --git a/aten/src/ATen/native/Pooling.cpp b/aten/src/ATen/native/Pooling.cpp index 89eb276ed418b..0dd877a552f1d 100644 --- a/aten/src/ATen/native/Pooling.cpp +++ b/aten/src/ATen/native/Pooling.cpp @@ -68,8 +68,7 @@ std::tuple adaptive_max_pool1d(const Tensor & self, IntArrayRef o " being empty"); } - Tensor output, indices; - std::tie(output, indices) = at::adaptive_max_pool2d( + auto [output, indices] = at::adaptive_max_pool2d( self.unsqueeze(-2), {1, output_size[0]}); @@ -94,8 +93,7 @@ std::tuple max_pool1d_with_indices( NoNamesGuard guard; - Tensor output, indices; - std::tie(output, indices) = at::max_pool2d_with_indices( + auto [output, indices] = at::max_pool2d_with_indices( self.unsqueeze(-2), {1, kernel_size[0]}, {1, stride[0]}, diff --git a/aten/src/ATen/native/QuantizedLinear.cpp b/aten/src/ATen/native/QuantizedLinear.cpp index 73fd1c1a94189..5fa45f3099844 100644 --- a/aten/src/ATen/native/QuantizedLinear.cpp +++ b/aten/src/ATen/native/QuantizedLinear.cpp @@ -64,7 +64,7 @@ Tensor fbgemm_linear_int8_weight_fp32_activation( "and will be removed in a future PyTorch release.") const Tensor input_contig = input.contiguous(); - const float* input_ptr = input_contig.data_ptr(); + const float* input_ptr = input_contig.const_data_ptr(); TORCH_CHECK(input.dim() >= 2); // NOLINTNEXTLINE(bugprone-narrowing-conversions,cppcoreguidelines-narrowing-conversions) @@ -125,6 +125,9 @@ Tensor fbgemm_linear_int8_weight_fp32_activation( auto& pack_b = cpp_custom_type_hack::cast>(packed); + int32_t* col_offsets_data = col_offsets.data_ptr(); + float* bias_contig_data = bias_contig.data_ptr(); + const int num_tasks = at::get_num_threads(); at::parallel_for(0, num_tasks, 1, [&](int64_t begin, int64_t end) { // This operation does the following: @@ -162,8 +165,8 @@ Tensor fbgemm_linear_int8_weight_fp32_activation( /*Aq_zero_point=*/q_params.zero_point, /*Bq_zero_point=*/&weight_zero_point_int32, /*row_offsets=*/pack_a.getRowOffsetBuffer(), - /*col_offsets=*/col_offsets.data_ptr(), - /*bias=*/bias_contig.data_ptr(), + /*col_offsets=*/col_offsets_data, + /*bias=*/bias_contig_data, /*nCol=*/N); // Do the GEMM fbgemm::fbgemmPacked( @@ -302,7 +305,7 @@ Tensor fbgemm_pack_quantized_matrix(const Tensor& weight) { const int64_t K = weight.size(1); const int64_t N = weight.size(0); const Tensor weight_contig = weight.contiguous(); - const int8_t* weight_ptr = weight_contig.data_ptr(); + const int8_t* weight_ptr = weight_contig.const_data_ptr(); auto ptr = std::make_unique>( /*trans=*/fbgemm::matrix_op_t::Transpose, /*nRow=*/K, @@ -421,7 +424,7 @@ Tensor fbgemm_linear_fp16_weight_fp32_activation( TORCH_CHECK(fbgemm::fbgemmSupportedCPU(), "Your CPU doesn't support FBGEMM."); const Tensor input_contig = input.contiguous(); - const float* input_ptr = input_contig.data_ptr(); + const float* input_ptr = input_contig.const_data_ptr(); // Pull out the PackedGemmMatrixFP16 instance from the owning tensor const fbgemm::PackedGemmMatrixFP16& packed_weight_fp16 = diff --git a/aten/src/ATen/native/RNN.cpp 
b/aten/src/ATen/native/RNN.cpp index c2a901989717c..97ce09ac8e51d 100644 --- a/aten/src/ATen/native/RNN.cpp +++ b/aten/src/ATen/native/RNN.cpp @@ -71,6 +71,12 @@ bool use_miopen(const at::Tensor& input, const double dropout_state) { (detail::getCUDAHooks().compiledWithMIOpen()) && (input.is_cuda()) && (at::globalContext().userEnabledCuDNN()); + // MIOpen functions returns miopenStatusBadParm on empty + // tensors. Maybe some functions actually support empty tensors, but + // native kernels shouldn't be much slower because the output is also + // likely empty. + if (input.sym_numel() == 0) return false; + return is_miopen_acceptable; } @@ -290,10 +296,7 @@ struct QuantizedCellParams : public CellParamsBase { } static c10::intrusive_ptr __setstate__( CellParamsSerializationType state) { - std::vector tensors; - std::vector doubles; - std::vector longs; - std::tie(std::ignore, tensors, doubles, longs, std::ignore) = + auto [_, tensors, doubles, longs, __] = std::move(state); TORCH_INTERNAL_ASSERT(tensors.size() == 6); TORCH_INTERNAL_ASSERT(doubles.size() == 2); @@ -338,12 +341,9 @@ c10::intrusive_ptr make_quantized_cell_params( std::make_tuple(std::move(packed_weight)), std::move(params)); }; - at::Tensor qw_ih, qw_hh, packed_ih, packed_hh, col_offsets_ih, col_offsets_hh; - at::Scalar scale_ih, scale_hh, zero_point_ih, zero_point_hh; - - std::tie(packed_ih, qw_ih, col_offsets_ih, scale_ih, zero_point_ih) = + auto [packed_ih, qw_ih, col_offsets_ih, scale_ih, zero_point_ih] = make_vals(w_ih); - std::tie(packed_hh, qw_hh, col_offsets_hh, scale_hh, zero_point_hh) = + auto [packed_hh, qw_hh, col_offsets_hh, scale_hh, zero_point_hh] = make_vals(w_hh); return c10::make_intrusive( @@ -438,10 +438,7 @@ struct QuantizedCellParamsDynamic : public CellParamsBase { } static c10::intrusive_ptr __setstate__( CellParamsSerializationType state) { - std::vector tensors; - std::vector> packed_params; - std::vector serialized_ints; - std::tie(std::ignore, tensors, std::ignore, serialized_ints, packed_params) = + auto [_, tensors, __, serialized_ints, packed_params] = std::move(state); TORCH_INTERNAL_ASSERT(tensors.size() == 2); TORCH_INTERNAL_ASSERT(packed_params.size() == 2); @@ -514,10 +511,7 @@ struct QuantizedCellParamsFP16 : public CellParamsBase { } static c10::intrusive_ptr __setstate__( CellParamsSerializationType state) { - std::vector> packed_params; - std::tie( - std::ignore, std::ignore, std::ignore, std::ignore, packed_params) = - std::move(state); + auto packed_params = std::get<4>(std::move(state)); TORCH_INTERNAL_ASSERT(packed_params.size() == 2); return make_quantized_cell_params_fp16( /*w_ih_packed=*/std::move(packed_params[0]), @@ -730,7 +724,7 @@ struct LSTMCell : Cell, cell_params> { const auto& hx = std::get<0>(hidden); const auto& cx = std::get<1>(hidden); - if (input.is_cuda()) { + if (input.is_cuda() || input.is_privateuseone()) { TORCH_CHECK(!pre_compute_input); auto igates = params.matmul_ih(input); auto hgates = params.matmul_hh(hx); @@ -766,7 +760,7 @@ struct GRUCell : Cell { const hidden_type& hidden, const cell_params& params, bool pre_compute_input = false) const override { - if (input.is_cuda() || input.is_xpu()) { + if (input.is_cuda() || input.is_xpu() || input.is_privateuseone()) { TORCH_CHECK(!pre_compute_input); auto igates = params.matmul_ih(input); auto hgates = params.matmul_hh(hidden); @@ -1167,7 +1161,7 @@ bool _use_cudnn_rnn_flatten_weight() { } // NB: This a (composite) wrapper for _thnn_fused_lstm_cell_backward_impl. 
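Note: several __setstate__ and factory helpers above drop the pre-declared locals plus std::tie(..., std::ignore, ...) in favor of C++17 structured bindings, which name exactly the pieces they need. A tiny standalone illustration of that rewrite (types and values are invented):

#include <cstdint>
#include <string>
#include <tuple>
#include <vector>

using SerializationType =
    std::tuple<std::string, std::vector<double>, std::vector<int64_t>>;

SerializationType make_state() {
  return {"cell", {0.5, 0.25}, {1, 2, 3}};
}

void restore() {
  // before: std::vector<double> doubles; std::vector<int64_t> longs;
  //         std::tie(std::ignore, doubles, longs) = make_state();
  // after: one structured binding; unused fields just get throwaway names
  auto [tag, doubles, longs] = make_state();
  (void)tag;
  (void)doubles;
  (void)longs;
}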
-// It duplicates the outputs of this function so the non-composite verison doesn't have to. +// It duplicates the outputs of this function so the non-composite version doesn't have to. // The point is so that we avoid triggering TensorImpl use count asserts in debug mode std::tuple _thnn_fused_lstm_cell_backward( const c10::optional& grad_hy_opt, const c10::optional& grad_cy_opt, const Tensor& cx, const Tensor& cy, diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index 5fa3e2b0af2d6..d29b177c13960 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -173,7 +174,7 @@ static void check_result_is_bytebool(const char* name, const Tensor& self, const // Note [all, any : uint8 compatibility]: // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -// For NumPy comptability, `all` and `any` return +// For NumPy compatibility, `all` and `any` return // Tensor of dtype `bool`. However for compatibility reason, // for `uint8`, they return Tensor of same dtype `uint8`. // Reference: https://github.com/pytorch/pytorch/pull/47878#issuecomment-747108561 @@ -510,7 +511,7 @@ static Tensor reversed_cumsum(const Tensor& w, int64_t dim) { Tensor cumprod_backward(const Tensor& grad, const Tensor& input, int64_t dim, const Tensor& output) { /* We show here how to derive an O(n) gradient formula for - abitrary inputs. It follows via a basic application of the + arbitrary inputs. It follows via a basic application of the chain rule together with a number of observations for different cases. We assume that x is an n-dimensional vector and y = cumprod(x). In the actual implementation we will need to play a bit with masks @@ -527,7 +528,7 @@ Tensor cumprod_backward(const Tensor& grad, const Tensor& input, int64_t dim, co The term dF / dy_j is just grad_output[j] (assuming again everything is one-dimensional). - The term (dy_j / dx_k) is easilly seen to be + The term (dy_j / dx_k) is easily seen to be if j >= k dy_j / dx_k = prod_{1 <= i <= j, i != k} x_i @@ -589,7 +590,7 @@ Tensor cumprod_backward(const Tensor& grad, const Tensor& input, int64_t dim, co dy_j / dx_z1 = prod(x[:z1]) * (grad_output[z1] + sum(grad_output[z1+1:z2] * cumprod(x[z1+1:z2]))) - When the imputs are complex, this is map is holomorphic. As such, to compute + When the inputs are complex, this is map is holomorphic. As such, to compute its backwards is just the conjugate of the usual backwards. This simplifies to conjugating the input. 
We may also reuse the output as, since the map is holomorphic, cumprod(input.conj()) = cumprod(input).conj() @@ -1170,6 +1171,25 @@ std::vector gradient(const Tensor& self, IntArrayRef dim, int64_t edge_o // ALL REDUCE ################################################################# +inline bool should_use_acc_buffer(at::TensorIterator& iter) { + const auto ndim = iter.ndim(); + if (!iter.device().is_cpu() || iter.noutputs() != 1) { + return false; + } + if (!at::isReducedFloatingType(iter.common_dtype())) { + return false; + } + if (ndim < 2) { + return false; + } + auto out_strides = iter.strides(0); + for (const auto dim : c10::irange(0, 2)) { + if (out_strides[dim] != 0) { + return false; + } + } + return true; +} TORCH_IMPL_FUNC(sum_out) (const Tensor& self, @@ -1181,7 +1201,19 @@ TORCH_IMPL_FUNC(sum_out) if (iter.numel() == 0) { result.zero_(); } else { - sum_stub(iter.device_type(), iter); + // Here is a limitation of TensorIterator reductions for permuted input with lower precision on CPU. + // Consider the case: TensorIterator coalesces such input and output to >= 2 dims tensors, + // and the output stride is [0, 0, x, x, ...] with x >= 0 (two reduced dimensions and non-reduced dims). + // Since the reduction loop only operates on two dimensions at a time, + // the intermediate sums is forced to do accumulation in the second reduced dim with lower precision. + // See https://github.com/pytorch/pytorch/issues/83149 + if (should_use_acc_buffer(iter)) { + auto tmp_output = at::empty(result.sizes(), result.options().dtype(kFloat)); + at::sum_outf(self.to(ScalarType::Float), opt_dim, keepdim, /*dtype=*/c10::nullopt, tmp_output); + result.copy_(tmp_output); + } else{ + sum_stub(iter.device_type(), iter); + } } } @@ -1250,7 +1282,7 @@ Tensor trace_cpu(const Tensor& self) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX(self.scalar_type(), "trace", [&] { using accscalar_t = at::acc_type; accscalar_t sum = 0; - const auto* t_data = self.data_ptr(); + const auto* t_data = self.const_data_ptr(); int64_t t_stride_0, t_stride_1, t_diag_size; @@ -1726,7 +1758,7 @@ static double std_var_all_cpu(const Tensor& self, double correction, bool take_s auto mean = self.mean().item(); auto iter = TensorIteratorConfig() - .add_input(self) + .add_const_input(self) .build(); auto reduction = [&](int64_t begin, int64_t end, double thread_sum) { @@ -2197,7 +2229,7 @@ bool cpu_equal(const Tensor& self, const Tensor& other) { return true; } std::atomic result{true}; - auto iter = TensorIteratorConfig().add_input(self).build(); + auto iter = TensorIteratorConfig().add_const_input(self).build(); AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(kHalf, kBFloat16, iter.input_dtype(), "equal_notnan_cpu", [&] { iter.for_each([&](char** data, const int64_t *strides, int64_t dim_size) { if (!result) { @@ -2218,13 +2250,13 @@ bool cpu_equal(const Tensor& self, const Tensor& other) { std::atomic result{true}; auto iter = TensorIteratorConfig() - .add_input(self) - .add_input(other) + .add_const_input(self) + .add_const_input(other) .allow_cpu_scalars(true) .promote_inputs_to_common_dtype(true) .build(); - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(kBool, kBFloat16, kHalf, iter.input_dtype(), "equal_cpu", [&] { + AT_DISPATCH_V2(iter.input_dtype(), "equal_cpu", AT_WRAP([&] { iter.for_each([&](char** data, const int64_t *strides, int64_t dim_size) { if (!result) { return; @@ -2240,7 +2272,7 @@ bool cpu_equal(const Tensor& self, const Tensor& other) { other_data += strides[1]; } }); - }); + }), kBool, kBFloat16, kHalf, 
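Note: the should_use_acc_buffer branch above works around a TensorIterator limitation described in issue #83149: for permuted low-precision inputs on CPU the inner reduction loop would otherwise accumulate partial sums in bfloat16/half, so sum_out now reduces into a temporary float tensor and copies the result back. The user-level equivalent, as a hedged sketch:

#include <ATen/ATen.h>

// Sum a reduced-precision tensor while accumulating in fp32, then cast back;
// this mirrors what the acc-buffer branch above does internally.
at::Tensor sum_low_precision(const at::Tensor& x, int64_t dim) {
  return x.to(at::kFloat).sum(dim).to(x.scalar_type());
}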
AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); return result.load(); } diff --git a/aten/src/ATen/native/ReduceOpsUtils.h b/aten/src/ATen/native/ReduceOpsUtils.h index bec04f0cd935b..6989b00f6f3e6 100644 --- a/aten/src/ATen/native/ReduceOpsUtils.h +++ b/aten/src/ATen/native/ReduceOpsUtils.h @@ -368,7 +368,13 @@ static void resize_reduction( DimVector dims_ = at::native::make_dim_vector(opt_dims, self.dim()); maybe_wrap_dims(dims_, self.dim()); auto shape = get_reduction_shape(self, dims_, keepdim, allow_empty_dims); - meta.set_output_raw_strided(0, shape, {}, self.options().dtype(out_dtype)); + if (self.layout() == kStrided) { + meta.set_output_raw_strided(0, shape, {}, self.options().dtype(out_dtype)); + } else if (shape.size() == 0) { + meta.set_output_raw_strided(0, shape, {}, self.options().dtype(out_dtype).layout(kStrided)); + } else { + TORCH_CHECK(false, "resize_reduction: support for output with ", self.layout(), " layout is not implemented yet"); + } namedinference::propagate_names_for_reduction( meta.maybe_get_output(), self, dims_, keepdim); } diff --git a/aten/src/ATen/native/Repeat.cpp b/aten/src/ATen/native/Repeat.cpp index 81d6be7b85200..dd87cead1f480 100644 --- a/aten/src/ATen/native/Repeat.cpp +++ b/aten/src/ATen/native/Repeat.cpp @@ -16,8 +16,8 @@ template static void compute_cpu( - index_t* repeat_ptr, - int64_t* cumsum_ptr, + const index_t* repeat_ptr, + const int64_t* cumsum_ptr, index_t* result_ptr, int64_t size, int64_t result_size) { diff --git a/aten/src/ATen/native/Repeat.h b/aten/src/ATen/native/Repeat.h index a90ed815f9352..e9a471d16f931 100644 --- a/aten/src/ATen/native/Repeat.h +++ b/aten/src/ATen/native/Repeat.h @@ -14,7 +14,7 @@ namespace at::native { template < typename index_t, - void compute(index_t*, int64_t*, index_t*, int64_t, int64_t)> + void compute(const index_t*, const int64_t*, index_t*, int64_t, int64_t)> static inline Tensor repeat_interleave_common( const Tensor& repeats, c10::optional output_size) { @@ -38,8 +38,8 @@ static inline Tensor repeat_interleave_common( } Tensor result = at::empty({total}, repeats.options()); - index_t* repeat_ptr = repeats_.data_ptr(); - int64_t* cumsum_ptr = cumsum.data_ptr(); + const index_t* repeat_ptr = repeats_.const_data_ptr(); + const int64_t* cumsum_ptr = cumsum.const_data_ptr(); index_t* result_ptr = result.data_ptr(); compute(repeat_ptr, cumsum_ptr, result_ptr, repeats.size(0), total); return result; diff --git a/aten/src/ATen/native/ReplicationPadding.cpp b/aten/src/ATen/native/ReplicationPadding.cpp index 13d784b1fbb20..d0762d28459f6 100644 --- a/aten/src/ATen/native/ReplicationPadding.cpp +++ b/aten/src/ATen/native/ReplicationPadding.cpp @@ -25,6 +25,7 @@ namespace at::meta { TORCH_META_FUNC(replication_pad1d) ( const Tensor& input, IntArrayRef paddingSize // no out argument! 
) { + TORCH_CHECK(paddingSize.size() == 2, "padding size is expected to be 2"); int64_t dimw = 1; int64_t dimslices = 0; @@ -85,6 +86,7 @@ TORCH_META_FUNC(replication_pad1d_backward) ( TORCH_META_FUNC(replication_pad2d) ( const Tensor& input, IntArrayRef paddingSize ) { + TORCH_CHECK(paddingSize.size() == 4, "padding size is expected to be 4"); int64_t pad_l = paddingSize[0]; int64_t pad_r = paddingSize[1]; int64_t pad_t = paddingSize[2]; @@ -124,6 +126,7 @@ TORCH_META_FUNC(replication_pad2d) ( TORCH_META_FUNC(replication_pad3d) ( const Tensor& input, IntArrayRef paddingSize ) { + TORCH_CHECK(paddingSize.size() == 6, "padding size is expected to be 6"); int64_t pleft = paddingSize[0]; int64_t pright = paddingSize[1]; int64_t ptop = paddingSize[2]; diff --git a/aten/src/ATen/native/Resize.cpp b/aten/src/ATen/native/Resize.cpp index 415d3d65bef42..be88538ed7082 100644 --- a/aten/src/ATen/native/Resize.cpp +++ b/aten/src/ATen/native/Resize.cpp @@ -94,13 +94,14 @@ void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes) { if (size_bytes != 0) { new_data = storage->allocator()->allocate(size_bytes); } - at::DataPtr old_data = storage->set_data_ptr(std::move(new_data)); + const at::DataPtr& old_data = storage->data_ptr(); const auto old_capacity = storage->nbytes(); - storage->set_nbytes(size_bytes); const auto copy_capacity = std::min(size_bytes, old_capacity); if (old_data != nullptr && copy_capacity > 0) { - memcpy(storage->mutable_data(), old_data.get(), copy_capacity); + memcpy(new_data.get(), old_data.get(), copy_capacity); } + storage->set_data_ptr_noswap(std::move(new_data)); + storage->set_nbytes(size_bytes); } // Call the sparse implementation in SparseTensor.cpp directly. @@ -281,4 +282,50 @@ const Tensor& resize__symint( return _resize_(self, size, optional_memory_format); } +void resize_bytes_nocuda(const Storage& storage, c10::SymInt newsize) { + // handles all devices except cuda (which needs to be in a different .so) + c10::DeviceType device_type = storage.device_type(); + if (device_type == at::kCPU) { + at::native::resize_bytes_cpu(storage.unsafeGetStorageImpl(), newsize.expect_int()); + } else if (device_type == at::kMeta) { + at::native::resize_bytes_meta(storage.unsafeGetStorageImpl(), newsize); + } else if (device_type == at::kPrivateUse1) { + at::GetPrivateUse1HooksInterface()->resizePrivateUse1Bytes( + storage, newsize.expect_int()); + } else if (device_type == at::kXPU || device_type == at::kHPU) { + ptrdiff_t size_bytes_i = newsize.expect_int(); + TORCH_CHECK( + !c10::overflows(size_bytes_i), + "Requested storage size (", + size_bytes_i, + ") cannot be represented as a int64_t"); + const auto size_bytes = static_cast(size_bytes_i); + void* original_data_ptr = storage.data_ptr().get(); + + auto src_option = + c10::TensorOptions().device(storage.device()).dtype(at::kByte); + auto src_tensor = at::empty({0}, src_option).set_(storage); + src_tensor.resize_({size_bytes}); + + // When using resize_ to replace resize_bytes_xxx, in some cases + // the original data_ptr is still returned, which is an inconsistent + // behavior when compared to resize_bytes_xxx. For these cases, + // an additional memory copy and update for storage are required. 
+ if (original_data_ptr == src_tensor.storage().data_ptr().get()) { + auto new_tensor = at::empty(src_tensor.sizes(), src_tensor.options()); + new_tensor.copy_(src_tensor); + storage.set_data_ptr_noswap( + std::move(new_tensor.storage().mutable_data_ptr())); + storage.unsafeGetStorageImpl()->set_allocator( + new_tensor.storage().unsafeGetStorageImpl()->allocator()); + storage.set_nbytes(new_tensor.storage().nbytes()); + } + } else { + TORCH_CHECK( + false, + "UntypedStorage.resize_: got unexpected device type ", + device_type); + } +} + } // namespace at::native diff --git a/aten/src/ATen/native/Resize.h b/aten/src/ATen/native/Resize.h index b752b91e04f38..0a1f21298957d 100644 --- a/aten/src/ATen/native/Resize.h +++ b/aten/src/ATen/native/Resize.h @@ -38,6 +38,7 @@ TORCH_API bool resize_output_check_symint(const Tensor& output, SymIntArrayRef s TORCH_API void resize_bytes_cpu(StorageImpl* storage, size_t size_bytes); TORCH_API void resize_bytes_meta(StorageImpl* storage, c10::SymInt size_bytes); +TORCH_API void resize_bytes_nocuda(const Storage& storage, c10::SymInt size_bytes); static inline void maybe_resize_storage_cpu(TensorImpl* self, size_t new_size_bytes) { // It does not make sense to try to resize a storage diff --git a/aten/src/ATen/native/Scalar.cpp b/aten/src/ATen/native/Scalar.cpp index 1d7adc5f569f8..ec19449d4133e 100644 --- a/aten/src/ATen/native/Scalar.cpp +++ b/aten/src/ATen/native/Scalar.cpp @@ -40,7 +40,7 @@ Scalar _local_scalar_dense_cpu(const Tensor& self) { self.scalar_type(), "_local_scalar_dense_cpu", AT_WRAP([&] { - scalar_t value = *self.data_ptr(); + scalar_t value = *self.const_data_ptr(); r = Scalar(value); }), AT_EXPAND(AT_SD_TYPES) diff --git a/aten/src/ATen/native/SegmentReduce.cpp b/aten/src/ATen/native/SegmentReduce.cpp index 61d2a1f60ca11..3c7b539ee4b6d 100644 --- a/aten/src/ATen/native/SegmentReduce.cpp +++ b/aten/src/ATen/native/SegmentReduce.cpp @@ -52,7 +52,7 @@ void _segment_reduce_lengths_cpu_kernel1( AT_DISPATCH_FLOATING_TYPES_AND2( kBFloat16, kHalf, data.scalar_type(), "_segment_reduce_cpu", [&]() { auto* output_data = output.data_ptr(); - const auto* values_data = data.data_ptr(); + const auto* values_data = data.const_data_ptr(); for (const auto outer_idx : c10::irange(outer_offset)) { int64_t segment_start, segment_length; int64_t segment_end = is_offsets_like ? 
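Note: the reworked resize_bytes_cpu above assembles the new buffer completely (allocation plus a copy of the overlapping bytes) before installing it with set_data_ptr_noswap and updating nbytes, instead of swapping the pointer first and copying through mutable_data() afterwards. The same copy-then-publish ordering in a generic, standalone form:

#include <algorithm>
#include <cstddef>
#include <cstdlib>
#include <cstring>

struct Buf {
  void* data = nullptr;
  size_t nbytes = 0;
};

void resize_buf(Buf& b, size_t new_bytes) {
  void* fresh = new_bytes != 0 ? std::malloc(new_bytes) : nullptr;
  const size_t copy_bytes = std::min(new_bytes, b.nbytes);
  if (b.data != nullptr && copy_bytes > 0) {
    std::memcpy(fresh, b.data, copy_bytes);   // copy into the new buffer first
  }
  std::free(b.data);
  b.data = fresh;        // ...then publish the pointer
  b.nbytes = new_bytes;  // ...and finally the new size
}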
@@ -145,7 +145,7 @@ Tensor _segment_reduce_lengths_cpu_kernel( auto output = at::empty(output_shape, data.options()); AT_DISPATCH_INDEX_TYPES(lengths.scalar_type(), "_segment_reduce_lengths_cpu_kernel1", [&]() { - const auto* lengths_data = lengths.data_ptr(); + const auto* lengths_data = lengths.const_data_ptr(); _segment_reduce_lengths_cpu_kernel1( reduction, data, lengths_data, axis, initial, output, segment_count, lengths_stride_axis); }); @@ -171,7 +171,7 @@ Tensor _segment_reduce_offsets_cpu_kernel( auto output = at::empty(output_shape, data.options()); AT_DISPATCH_INDEX_TYPES(offsets.scalar_type(), "_segment_reduce_offsets_cpu_kernel1", [&]() { - const auto* offsets_data = offsets.data_ptr(); + const auto* offsets_data = offsets.const_data_ptr(); _segment_reduce_lengths_cpu_kernel1( reduction, data, offsets_data, axis, initial, output, segment_count, offsets_stride_axis); }); @@ -211,10 +211,10 @@ void _segment_reduce_cpu_lengths_backward_kernel1( data_contig.scalar_type(), "_segment_reduce_cpu", [&]() { - auto* output_data = output_contig.data_ptr(); - auto* grad_data = grad_contig.data_ptr(); + auto* output_data = output_contig.const_data_ptr(); + auto* grad_data = grad_contig.const_data_ptr(); auto* grad_input_data = grad_input.mutable_data_ptr(); - const auto* values_data = data_contig.data_ptr(); + const auto* values_data = data_contig.const_data_ptr(); // Used to calculate exclusive prod scalar_t initial_prod_value; if (reduction == ReductionType::PROD) { @@ -331,7 +331,7 @@ Tensor _segment_reduce_cpu_lengths_backward_kernel( AT_DISPATCH_INDEX_TYPES( lengths_contig.scalar_type(), "_segment_reduce_cpu_lengths_backward_kernel1", [&] { - const auto* lengths_data = lengths_contig.data_ptr(); + const auto* lengths_data = lengths_contig.const_data_ptr(); _segment_reduce_cpu_lengths_backward_kernel1( grad_contig, output_contig, @@ -364,7 +364,7 @@ Tensor _segment_reduce_cpu_offsets_backward_kernel( AT_DISPATCH_INDEX_TYPES( offsets_contig.scalar_type(), "_segment_reduce_cpu_offsets_backward_kernel1", [&] { - const auto* offsets_data = offsets_contig.data_ptr(); + const auto* offsets_data = offsets_contig.const_data_ptr(); _segment_reduce_cpu_lengths_backward_kernel1( grad_contig, output_contig, diff --git a/aten/src/ATen/native/SobolEngineOpsUtils.cpp b/aten/src/ATen/native/SobolEngineOpsUtils.cpp index 1e129673accdd..3d492221c5057 100644 --- a/aten/src/ATen/native/SobolEngineOpsUtils.cpp +++ b/aten/src/ATen/native/SobolEngineOpsUtils.cpp @@ -3,7 +3,7 @@ #include /* -The direction nubmers in this file were generated using the +The direction numbers in this file were generated using the python script below (thius this assumes that the file https://web.maths.unsw.edu.au/~fkuo/sobol/new-joe-kuo-6.21201 is present in the working directory). For additional details see [1]. diff --git a/aten/src/ATen/native/SoftMax.cpp b/aten/src/ATen/native/SoftMax.cpp index 30bb6da1d8642..bd321a0a88e7a 100644 --- a/aten/src/ATen/native/SoftMax.cpp +++ b/aten/src/ATen/native/SoftMax.cpp @@ -637,8 +637,8 @@ Tensor masked_softmax_cpu(const Tensor& input_, const Tensor& mask_, const c10:: input = input.view(1); } - AT_DISPATCH_FLOATING_TYPES_AND( - at::ScalarType::BFloat16, input.scalar_type(), "masked_softmax", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::BFloat16, at::ScalarType::Half, input.scalar_type(), "masked_softmax", [&] { host_softmax< scalar_t, false /* LogSoftMax */, @@ -670,8 +670,8 @@ Tensor masked_softmax_backward_cpu( mask = mask.dim() == 0 ? 
mask.view(1) : mask; Tensor grad_input = at::empty_like(grad, grad.options()); - AT_DISPATCH_FLOATING_TYPES_AND( - at::ScalarType::BFloat16, grad.scalar_type(), "masked_softmax_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::BFloat16, at::ScalarType::Half, grad.scalar_type(), "masked_softmax_backward", [&] { host_softmax_backward< scalar_t, false /* LogSoftMax */, diff --git a/aten/src/ATen/native/Sorting.cpp b/aten/src/ATen/native/Sorting.cpp index 91f05e367fed2..b31007408c7ae 100644 --- a/aten/src/ATen/native/Sorting.cpp +++ b/aten/src/ATen/native/Sorting.cpp @@ -72,9 +72,6 @@ TORCH_META_FUNC(topk) TORCH_META_FUNC2(sort, stable) (const Tensor& self, c10::optional stable, int64_t dim, bool descending) { - TORCH_INTERNAL_ASSERT( - stable.has_value(), - "sort(): c10::optional for stable has to have value."); maybe_wrap_dim(dim, self.dim()); // See issue: https://github.com/pytorch/pytorch/issues/65863 @@ -549,7 +546,7 @@ std::tuple median_with_indices_impl( .declare_static_shape(sizes, /*squash_dims=*/dim) .add_output(vals) .add_output(inds) - .add_input(in) + .add_const_input(in) .build(); AT_DISPATCH_ALL_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, in.scalar_type(), "median_out", [&] { @@ -953,7 +950,7 @@ TORCH_IMPL_FUNC(sort_stable_out) indices.zero_(); } else { dim = maybe_wrap_dim(dim, self.dim()); - sort_stub(self.device().type(), self, values, indices, dim, descending, stable.value()); + sort_stub(self.device().type(), self, values, indices, dim, descending, stable.value_or(false)); } } diff --git a/aten/src/ATen/native/SpectralOps.cpp b/aten/src/ATen/native/SpectralOps.cpp index 26422ed8130a7..7ed068874e68a 100644 --- a/aten/src/ATen/native/SpectralOps.cpp +++ b/aten/src/ATen/native/SpectralOps.cpp @@ -83,8 +83,7 @@ ScalarType promote_type_fft(ScalarType type, bool require_complex, Device device const bool maybe_support_half = ( // Only CUDA supports half precision, but since meta tensors don't have a // device we err on the side of accepting it - (device.is_cuda() || device.is_meta()) && - !at::detail::getCUDAHooks().hasROCM() + device.is_cuda() || device.is_meta() ); if (maybe_support_half) { TORCH_CHECK(type == kHalf || type == kFloat || type == kDouble, "Unsupported dtype ", type); diff --git a/aten/src/ATen/native/SpectralOpsUtils.h b/aten/src/ATen/native/SpectralOpsUtils.h index 7d9852b8e7b0b..279e4ff595567 100644 --- a/aten/src/ATen/native/SpectralOpsUtils.h +++ b/aten/src/ATen/native/SpectralOpsUtils.h @@ -3,7 +3,11 @@ #include #include #include +#include +#include +#include #include +#include namespace at::native { diff --git a/aten/src/ATen/native/SummaryOps.cpp b/aten/src/ATen/native/SummaryOps.cpp index 81a0ccd6d8337..4c158f81a47e9 100644 --- a/aten/src/ATen/native/SummaryOps.cpp +++ b/aten/src/ATen/native/SummaryOps.cpp @@ -43,7 +43,7 @@ Tensor _bincount_cpu_template( int64_t nbins = static_cast(*self.max().data_ptr()) + 1L; nbins = std::max(nbins, minlength); // at least minlength # of bins - const input_t* self_p = self.data_ptr(); + const input_t* self_p = self.const_data_ptr(); if (has_weights) { output = at::zeros( {nbins}, @@ -52,7 +52,7 @@ Tensor _bincount_cpu_template( weights.options().device_opt(), weights.options().pinned_memory_opt()); weights_t* output_p = output.data_ptr(); - const weights_t* weights_p = weights.data_ptr(); + const weights_t* weights_p = weights.const_data_ptr(); for (const auto i : c10::irange(self_size)) { output_p[self_p[i]] += weights_p[i]; } diff --git a/aten/src/ATen/native/TensorAdvancedIndexing.cpp 
b/aten/src/ATen/native/TensorAdvancedIndexing.cpp index ba00cbf344569..f1e385d8eeac8 100644 --- a/aten/src/ATen/native/TensorAdvancedIndexing.cpp +++ b/aten/src/ATen/native/TensorAdvancedIndexing.cpp @@ -408,9 +408,9 @@ static void build_index_op( config.set_check_mem_overlap(false) .check_all_same_dtype(false) .add_output(result) - .add_owned_input(info.src); + .add_owned_const_input(info.src); for (auto& index : info.indices) { - config.add_owned_input(index); + config.add_owned_const_input(index); } if (!result.defined()) { config.declare_static_dtype_and_device(info.src.scalar_type(), info.src.device()); @@ -614,9 +614,9 @@ static TensorIterator make_index_put_iterator(const AdvancedIndex& info, const T config.resize_outputs(false); config.check_all_same_dtype(false); config.add_output(info.src); - config.add_input(value); + config.add_const_input(value); for (auto& index : info.indices) { - config.add_input(index); + config.add_const_input(index); } return config.build(); } @@ -689,8 +689,8 @@ Tensor & put_(Tensor & self, const Tensor& index, const Tensor & source, const b auto iter = TensorIteratorConfig() .set_check_mem_overlap(false) .check_all_same_dtype(false) - .add_input(source) - .add_input(index_reshaped) + .add_const_input(source) + .add_const_input(index_reshaped) .build(); put_stub(iter.device_type(), iter, self, accumulate); @@ -769,7 +769,7 @@ Tensor& take_out(const Tensor& self, const Tensor& index, Tensor& out) { .set_check_mem_overlap(false) .check_all_same_dtype(false) .add_output(out) - .add_input(index) + .add_const_input(index) .build(); // Early return after out has been resized @@ -848,8 +848,8 @@ TORCH_IMPL_FUNC(index_copy_out) .check_all_same_dtype(false) .resize_outputs(false) .add_output(result_restrided) - .add_input(index_restrided) - .add_input(source_nonzero) + .add_const_input(index_restrided) + .add_const_input(source_nonzero) .build(); auto result_dim_size = result_nonzero.size(dim); @@ -943,15 +943,15 @@ TORCH_IMPL_FUNC(index_add_cpu_out) auto iter = TensorIterator::borrowing_binary_op(selfSlice, selfSlice, sourceSlice); AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_add_cpu_", [&] () { - auto index_data = index_contig.data_ptr(); + auto index_data = index_contig.const_data_ptr(); for (const auto i : c10::irange(numel)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self_dim_size), "index out of range in self"); auto self_data = static_cast(selfSlice.data_ptr()) + self_i * self_stride_bytes; - auto source_data = static_cast(sourceSlice.data_ptr()) + i * source_stride_bytes; + auto source_data = static_cast(sourceSlice.const_data_ptr()) + i * source_stride_bytes; iter.unsafe_replace_operand(0, self_data); iter.unsafe_replace_operand(1, self_data); - iter.unsafe_replace_operand(2, source_data); + iter.unsafe_replace_operand(2, const_cast(source_data)); add_stub(iter.device_type(), iter, alpha); } }); @@ -967,10 +967,10 @@ TORCH_IMPL_FUNC(index_add_cpu_out) auto source_stride = source.dim() == 0 ? 1 : source.stride(dim); // TODO: Maybe TensorAccessor can be used here? 
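+    // In this path only `result` is written; `source` and `index` are read-only, hence the const_data_ptr accesses below.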
auto* result_ptr = result.data_ptr(); - auto* source_ptr = source.data_ptr(); + auto* source_ptr = source.const_data_ptr(); AT_DISPATCH_INDEX_TYPES(index_contig.scalar_type(), "index_add_cpu_", [&index_contig, &numel, &result, &result_ptr, &result_stride, &source_ptr, &source_stride, &alpha_value] { - auto index_data = index_contig.data_ptr(); + auto index_data = index_contig.const_data_ptr(); for (const auto i : c10::irange(numel)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < result.numel()), "index out of range in self"); @@ -1040,15 +1040,15 @@ static void index_reduce_func_impl( auto iter = TensorIterator::borrowing_binary_op(selfSlice, selfSlice, sourceSlice); AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_func_cpu_", [&] () { - auto index_data = index_contig.data_ptr(); + auto index_data = index_contig.const_data_ptr(); for (const auto i : c10::irange(numel)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self_dim_size), "index out of range in self"); auto self_data = static_cast(selfSlice.data_ptr()) + self_i * self_stride_bytes; - auto source_data = static_cast(sourceSlice.data_ptr()) + i * source_stride_bytes; + auto source_data = static_cast(sourceSlice.const_data_ptr()) + i * source_stride_bytes; iter.unsafe_replace_operand(0, self_data); iter.unsafe_replace_operand(1, self_data); - iter.unsafe_replace_operand(2, source_data); + iter.unsafe_replace_operand(2, const_cast(source_data)); switch (op) { case ReductionType::PROD : @@ -1090,11 +1090,11 @@ static void index_reduce_func_impl( auto counts_stride = counts.dim() == 0 ? 1 : counts.stride(dim); // TODO: Maybe TensorAccessor can be used here? auto* result_ptr = result.data_ptr(); - auto* source_ptr = source.data_ptr(); + auto* source_ptr = source.const_data_ptr(); auto counts_ptr = counts.data_ptr(); AT_DISPATCH_INDEX_TYPES(index_contig.scalar_type(), "index_func_cpu_", [&index_contig, &numel, &result, &result_ptr, &result_stride, &source_ptr, &source_stride, &op, &counts_ptr, &counts_stride] { - auto index_data = index_contig.data_ptr(); + auto index_data = index_contig.const_data_ptr(); for (const auto i : c10::irange(numel)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < result.numel()), "index out of range in self"); @@ -1175,7 +1175,7 @@ static Tensor & index_select_out_cpu_dim1_( auto out = static_cast(result_contig.data_ptr()); - auto src_base = static_cast(self_contig.data_ptr()); + auto src_base = static_cast(self_contig.const_data_ptr()); auto self_sizes = self_contig.sizes(); auto outer_dims_product = c10::size_to_dim_(1, self_sizes); @@ -1191,7 +1191,7 @@ static Tensor & index_select_out_cpu_dim1_( AT_DISPATCH_INDEX_TYPES( index_contig.scalar_type(), "batch_index_select_compute", [&]() { - const auto* idxs = index_contig.data_ptr(); + const auto* idxs = index_contig.const_data_ptr(); check_indexarray_range(idxs, N, src_indexing_axis_dim); // Special-case single-float copy for efficiency @@ -1256,7 +1256,7 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor & "index_select(): self indexing axis dim should be positive"); AT_DISPATCH_INDEX_TYPES( index_contig.scalar_type(), "index_select_empty_self_bound_check", [&]() { - const auto* idxs = index_contig.data_ptr(); + const auto* idxs = index_contig.const_data_ptr(); check_indexarray_range(idxs, numel, src_indexing_axis_dim); }); return result; @@ -1269,7 +1269,7 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor & 
auto selfSlice = self.select(dim, 0); auto resultSlice = result.select(dim, 0); - auto selfSlice_data = selfSlice.data_ptr(); + auto selfSlice_data = selfSlice.const_data_ptr(); auto resultSlice_data = resultSlice.data_ptr(); auto self_stride_bytes = self.stride(dim) * elementSize(self.scalar_type()); auto result_stride_bytes = result.stride(dim) * elementSize(result.scalar_type()); @@ -1280,7 +1280,7 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor & .check_all_same_dtype(false) .resize_outputs(false) .add_output(resultSlice) - .add_input(selfSlice) + .add_const_input(selfSlice) .build(); auto grain_size = at::internal::GRAIN_SIZE; @@ -1293,14 +1293,14 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor & AT_DISPATCH_INDEX_TYPES(index_contig.scalar_type(), "index_select_out_cpu_", [&index_contig, &start, &end, &sub_iter, &self_dim_size, &selfSlice_data, &self_stride_bytes, &resultSlice_data, &result_stride_bytes] () { - auto index_data = index_contig.data_ptr(); + auto index_data = index_contig.const_data_ptr(); for (const auto i : c10::irange(start, end)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self_dim_size), "index out of range in self"); - auto self_data = static_cast(selfSlice_data) + self_i * self_stride_bytes; + auto self_data = static_cast(selfSlice_data) + self_i * self_stride_bytes; auto result_data = static_cast(resultSlice_data) + i * result_stride_bytes; sub_iter.unsafe_replace_operand(0, result_data); - sub_iter.unsafe_replace_operand(1, self_data); + sub_iter.unsafe_replace_operand(1, const_cast(self_data)); copy_stub(sub_iter.device_type(), sub_iter, false); }; }); @@ -1322,11 +1322,11 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor & AT_DISPATCH_INDEX_TYPES(index_contig.scalar_type(), "index_select_out_cpu_", [&index_contig, &slice_size_bytes, &self_dim_size, &selfSlice_data, &self_stride_bytes, &resultSlice_data, &result_stride_bytes, &start, &end] () { - auto index_data = index_contig.data_ptr(); + auto index_data = index_contig.const_data_ptr(); for (const auto i : c10::irange(start, end)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self_dim_size), "index out of range in self"); - auto self_data = static_cast(selfSlice_data) + self_i * self_stride_bytes; + auto self_data = static_cast(selfSlice_data) + self_i * self_stride_bytes; auto result_data = static_cast(resultSlice_data) + i * result_stride_bytes; memcpy(result_data, self_data, slice_size_bytes); } @@ -1344,16 +1344,16 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor & AT_DISPATCH_QINT_TYPES(self.scalar_type(), "index_select_quant", [&index_contig, &self, &result, &dim, &numel] { auto self_stride = self.dim() == 0 ? 1 : self.stride(dim); auto result_stride = result.dim() == 0 ? 
1 : result.stride(dim); - auto self_data_ptr = self.data_ptr(); + auto self_data_ptr = self.const_data_ptr(); auto result_data_ptr = result.data_ptr(); auto self_numel = self.numel(); AT_DISPATCH_INDEX_TYPES(index_contig.scalar_type(), "index_select_out_cpu_quant_", [&index_contig, &numel, &self_numel, &self_data_ptr, &self_stride, &result_data_ptr, &result_stride] { - auto index_data = index_contig.data_ptr(); + auto index_data = index_contig.const_data_ptr(); for (const auto i : c10::irange(numel)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self_numel), "index out of range in self"); - scalar_t *self_ip = self_data_ptr + self_i * self_stride; + const scalar_t *self_ip = self_data_ptr + self_i * self_stride; *(result_data_ptr + i * result_stride) = *self_ip; } }); @@ -1364,16 +1364,16 @@ Tensor & index_select_out_cpu_(const Tensor & self, int64_t dim, const Tensor & auto self_stride = self.dim() == 0 ? 1 : self.stride(dim); auto result_stride = result.dim() == 0 ? 1 : result.stride(dim); - auto self_data_ptr = self.data_ptr(); + auto self_data_ptr = self.const_data_ptr(); auto result_data_ptr = result.data_ptr(); auto self_numel = self.numel(); AT_DISPATCH_INDEX_TYPES(index_contig.scalar_type(), "index_select_out_cpu_", [&index_contig, &numel, &self_numel, &self_data_ptr, &self_stride, &result_data_ptr, &result_stride] { - auto index_data = index_contig.data_ptr(); + auto index_data = index_contig.const_data_ptr(); for (const auto i : c10::irange(numel)) { auto self_i = index_data[i]; TORCH_CHECK_INDEX((self_i >= 0) && (self_i < self_numel), "index out of range in self"); - scalar_t *self_ip = self_data_ptr + self_i * self_stride; + const scalar_t *self_ip = self_data_ptr + self_i * self_stride; *(result_data_ptr + i * result_stride) = *self_ip; } }); @@ -1462,7 +1462,7 @@ Tensor & index_fill_(Tensor & self, int64_t dim, const Tensor & index, const Sca .check_all_same_dtype(false) .resize_outputs(false) .add_output(self_restrided) - .add_input(index_restrided) + .add_const_input(index_restrided) .build(); auto self_dim_size = (self_nonzero_dim.sizes())[dim]; @@ -1880,8 +1880,7 @@ TORCH_IMPL_FUNC(scatter_reduce_two) } Tensor masked_scatter(const Tensor & self, const Tensor & mask, const Tensor & source) { - c10::MaybeOwned _mask, _self; - std::tie(_mask, _self) = expand_outplace(mask, self); + auto [_mask, _self] = expand_outplace(mask, self); return _self->clone(at::MemoryFormat::Contiguous).masked_scatter_(*_mask, source); } @@ -1924,7 +1923,7 @@ static Tensor & masked_fill_impl_cpu(Tensor & self, const Tensor & mask, const S .check_all_same_dtype(false) .resize_outputs(false) .add_output(self) - .add_input(mask) + .add_const_input(mask) .build(); masked_fill_stub(iter.device_type(), iter, value); @@ -1954,8 +1953,7 @@ Tensor masked_fill(const Tensor & self, const Tensor & mask, const Scalar& sourc auto maybe_outnames = namedinference::broadcast_to_outnames(mask, self, "masked_fill"); { NoNamesGuard guard; - c10::MaybeOwned _mask, _self; - std::tie(_mask, _self) = expand_outplace(mask, self); + auto [_mask, _self] = expand_outplace(mask, self); result = _self->clone(at::MemoryFormat::Contiguous); result.masked_fill_(mask, source); } @@ -1968,8 +1966,7 @@ Tensor masked_fill(const Tensor & self, const Tensor & mask, const Tensor & sour auto maybe_outnames = namedinference::broadcast_to_outnames(mask, self, "masked_fill"); { NoNamesGuard guard; - c10::MaybeOwned _mask, _self; - std::tie(_mask, _self) = expand_outplace(mask, self); + auto [_mask, _self] = 
expand_outplace(mask, self); result = _self->clone(at::MemoryFormat::Contiguous); result.masked_fill_(mask, source); } @@ -1989,8 +1986,7 @@ static Tensor & masked_select_out_impl_cpu(Tensor & result, const Tensor & self, at::assert_no_overlap(result, self); at::assert_no_overlap(result, mask); - c10::MaybeOwned _mask, _self; - std::tie(_mask, _self) = expand_outplace(mask, self); + auto [_mask, _self] = expand_outplace(mask, self); auto shape = _self->sizes(); int64_t numel = _mask->sum().item().toLong(); @@ -2017,8 +2013,8 @@ static Tensor & masked_select_out_impl_cpu(Tensor & result, const Tensor & self, .check_all_same_dtype(false) .resize_outputs(false) .add_output(result_strided) - .add_input(*_self) - .add_input(*_mask) + .add_const_input(*_self) + .add_const_input(*_mask) .build(); masked_select_serial_stub(iter.device_type(), iter, orig_stride); @@ -2041,9 +2037,9 @@ static Tensor & masked_select_out_impl_cpu(Tensor & result, const Tensor & self, .check_all_same_dtype(false) .resize_outputs(false) .add_output(result_strided) - .add_input(*_self) - .add_input(*_mask) - .add_input(mask_prefix_sum) + .add_const_input(*_self) + .add_const_input(*_mask) + .add_const_input(mask_prefix_sum) .build(); masked_select_stub(iter.device_type(), iter, orig_stride); @@ -2130,10 +2126,7 @@ static inline void checkDevice(CheckedFrom c, at::ArrayRef tensors, Devi Tensor take_along_dim(const Tensor& self, const Tensor& indices, c10::optional opt_dim) { checkDevice("torch.take_along_dim():", {self, indices}, self.device()); if (opt_dim.has_value()) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t dim; - Tensor self_broadcasted, indices_broadcasted; - std::tie(self_broadcasted, indices_broadcasted, dim) = + auto [self_broadcasted, indices_broadcasted, dim] = _take_along_dim_helper(self, indices, opt_dim.value()); return self_broadcasted.gather(dim, indices_broadcasted); } @@ -2145,10 +2138,7 @@ Tensor take_along_dim(const Tensor& self, const Tensor& indices, c10::optional opt_dim, Tensor& result) { checkDevice("torch.take_along_dim():", {self, indices, result}, self.device()); if (opt_dim.has_value()) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t dim; - Tensor self_broadcasted, indices_broadcasted; - std::tie(self_broadcasted, indices_broadcasted, dim) = + auto [self_broadcasted, indices_broadcasted, dim] = _take_along_dim_helper(self, indices, opt_dim.value()); return at::gather_out(result, self_broadcasted, dim, indices_broadcasted); } @@ -2228,7 +2218,7 @@ Tensor count_nonzero_cpu(const Tensor& self, IntArrayRef dims){ // Optimized all-reduce auto iter = TensorIteratorConfig() - .add_input(self) + .add_const_input(self) .build(); const auto num_threads = at::get_num_threads(); @@ -2267,7 +2257,7 @@ Tensor& nonzero_out_cpu(const Tensor& self, Tensor& result) { at::assert_no_overlap(result, self); auto iter = TensorIteratorConfig() - .add_input(self) + .add_const_input(self) .enforce_linear_iteration() .build(); @@ -2303,6 +2293,8 @@ Tensor& nonzero_out_cpu(const Tensor& self, Tensor& result) { return result; } + auto out_accessor = result.accessor(); + // Pass 2: Write indexes AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( kComplexHalf, kHalf, kBFloat16, kBool, self.scalar_type(), "nonzero_cpu", [&] { @@ -2323,7 +2315,6 @@ Tensor& nonzero_out_cpu(const Tensor& self, Tensor& result) { } } - auto out_accessor = result.accessor(); auto out_ptr = out_accessor[thread_count_nonzero[tid]].data(); auto loop = [&](char** data, const int64_t* strides, int64_t n1, int64_t n2) { @@ 
-2495,7 +2486,7 @@ Tensor & masked_scatter__cpu(Tensor& self, const Tensor & mask, const Tensor & s // order of indexing matters .enforce_linear_iteration() .add_output(self) - .add_input(*b_mask) + .add_const_input(*b_mask) .build(); masked_scatter_stub(iter.device_type(), iter, src_cont); diff --git a/aten/src/ATen/native/TensorCompare.cpp b/aten/src/ATen/native/TensorCompare.cpp index f7a2d0f766858..04d8e8cbf8313 100644 --- a/aten/src/ATen/native/TensorCompare.cpp +++ b/aten/src/ATen/native/TensorCompare.cpp @@ -13,6 +13,7 @@ #include #include #include +#include #include #ifndef AT_PER_OPERATOR_HEADERS @@ -22,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -71,6 +73,7 @@ #include #include +#include #include #endif @@ -128,17 +131,17 @@ const OptionalTensorRef max) { TensorIteratorConfig() \ .set_check_mem_overlap(true) \ .add_output(maybe_get_output()) \ - .add_input(self) \ + .add_const_input(self) \ .promote_inputs_to_common_dtype(true) \ .cast_common_dtype_to_outputs(true) \ .enforce_safe_casting_to_output(true) if (min && max) { - build(CLAMP_CONFIG().add_input(*min).add_input(*max)); + build(CLAMP_CONFIG().add_const_input(*min).add_const_input(*max)); } else if (min) { - build(CLAMP_CONFIG().add_input(*min)); + build(CLAMP_CONFIG().add_const_input(*min)); } else if (max) { - build(CLAMP_CONFIG().add_input(*max)); + build(CLAMP_CONFIG().add_const_input(*max)); } } @@ -440,6 +443,9 @@ Tensor _functional_assert_async_msg_cpu( return dep_token.clone(); } +void _print(c10::string_view s) { + std::cout << s << "\n"; +} // Sorting-based algorithm for isin(); used when the number of test elements is large. static void isin_sorting( @@ -455,17 +461,16 @@ static void isin_sorting( elements_flat = elements.ravel(); test_elements_flat = test_elements.ravel(); } else { - std::tie (elements_flat, unique_order) = at::_unique( + std::tie(elements_flat, unique_order) = at::_unique( elements, /*sorted=*/ false, /*return_inverse=*/ true); - std::tie (test_elements_flat, std::ignore) = at::_unique(test_elements, /*sorted=*/ false); + std::tie(test_elements_flat, std::ignore) = at::_unique(test_elements, /*sorted=*/ false); } // 2. Stable sort all elements, maintaining order indices to reverse the // operation. Stable sort is necessary to keep elements before test // elements within the sorted list. Tensor all_elements = at::cat({std::move(elements_flat), std::move(test_elements_flat)}); - Tensor sorted_elements, sorted_order; - std::tie (sorted_elements, sorted_order) = all_elements.sort( + auto [sorted_elements, sorted_order] = all_elements.sort( /*stable=*/ true, /*dim=*/ 0, /*descending=*/ false); // 3. Create a mask for locations of adjacent duplicate values within the @@ -503,17 +508,13 @@ Device out_device(Args&... inps){ Tensor& where_self_out(const Tensor& condition, const Tensor& self, const Tensor& other, Tensor& out) { - Tensor self_, other_, condition_; - if (self.dtype() != other.dtype()) { - auto result_type = at::native::result_type(self, other); - self_ = self.to(result_type); - other_ = other.to(result_type); - } else { - self_ = self; - other_ = other; - } + const auto result_type = at::native::result_type(self, other); + TORCH_CHECK(out.scalar_type() == result_type, "Expected out type to be ", result_type, " but got ", out.scalar_type()); + + auto self_ = self.scalar_type() != result_type ? self.to(result_type): self; + auto other_ = other.scalar_type() != result_type ? 
other.to(result_type): other; + auto condition_ = condition; auto device = out_device(condition, self_, other_); - condition_ = condition; if (device != at::kCPU) { // allow CPU scalars on non-cpu device if (condition.device() != device && condition.ndimension() == 0) { condition_ = condition.to(device); @@ -525,19 +526,18 @@ Tensor& where_self_out(const Tensor& condition, const Tensor& self, const Tensor other_ = other_.to(device); } } - if (condition.scalar_type() == ScalarType::Byte) { - TORCH_WARN_ONCE("where received a uint8 condition tensor. This behavior is deprecated and will be removed in a future version of PyTorch. Use a boolean condition instead."); - } else { - TORCH_CHECK(condition.scalar_type() == ScalarType::Bool, "where expected condition to be a boolean tensor, but got a tensor with dtype ", condition.scalar_type()); + if (condition_.scalar_type() == ScalarType::Byte) { + TORCH_WARN_ONCE("where received a uint8 condition tensor. This behavior is deprecated and will be removed in a future version of PyTorch. Use a boolean condition instead."); + condition_ = condition_.to(kBool); } - condition_ = condition_.scalar_type() == ScalarType::Byte ? condition_.to(ScalarType::Bool) : condition_; + TORCH_CHECK(condition_.scalar_type() == kBool, "where expected condition to be a boolean tensor, but got a tensor with dtype ", condition_.scalar_type()); // if there's still a device mismatch, let tensoriterator error out with it auto iter = at::TensorIteratorConfig() .check_all_same_dtype(false) .add_output(out) - .add_input(condition_) - .add_input(self_) - .add_input(other_) + .add_const_input(condition_) + .add_const_input(self_) + .add_const_input(other_) .build(); where_kernel(iter.device_type(), iter); return out; diff --git a/aten/src/ATen/native/TensorConversions.cpp b/aten/src/ATen/native/TensorConversions.cpp index ac3f6d4763ac8..c70da8334a5e9 100644 --- a/aten/src/ATen/native/TensorConversions.cpp +++ b/aten/src/ATen/native/TensorConversions.cpp @@ -254,15 +254,51 @@ Tensor _to_copy( // TODO: Use the dispatcher for this. // Currently there are unenumerated extensibility issues preventing this. 
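+  // Sparse COO: indices and values are converted separately below and the tensor is rebuilt with _sparse_coo_tensor_unsafe, preserving the coalesced flag; meta targets short-circuit to zeros_like.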
- if (at::sparse_csr::is_sparse_compressed(self)) { + if (self.layout() == kSparse) { + TORCH_CHECK( + memory_format == MemoryFormat::Preserve, + "to(options): COO only supports memory format Preserve, but got ", memory_format, + " instead."); + if (options.device().is_meta()) { + return zeros_like(self, options); + } + auto indices = self._indices(); + const auto new_indices = at::native::to( + indices, + indices.scalar_type(), + c10::kStrided, + device, + pin_memory, + non_blocking, + true, // force copy since we are in _to_copy + memory_format); + const auto new_values = at::native::to( + self._values(), + dtype, + c10::kStrided, + device, + pin_memory, + non_blocking, + true, // force copy since we are in _to_copy + memory_format); + + return at::_sparse_coo_tensor_unsafe( + new_indices, + new_values, + self.sizes(), + options, self.is_coalesced()); + } else if (at::sparse_csr::is_sparse_compressed(self)) { TORCH_CHECK( memory_format == MemoryFormat::Preserve, "to(options): ", at::sparse_csr::layoutToString(self.layout()), " only supports memory format Preserve, but got ", memory_format, " instead."); - Tensor compressed_indices, plain_indices; - std::tie(compressed_indices, plain_indices) = at::sparse_csr::getCompressedPlainIndices(self); + if (options.device().is_meta()) { + return zeros_like(self, options); + } + + auto [compressed_indices, plain_indices] = at::sparse_csr::getCompressedPlainIndices(self); const auto new_values = at::native::to( self.values(), @@ -340,7 +376,7 @@ Tensor _to_copy( } // See Note [Explicit nullopt MemoryFormat argument] // TODO: empty_quantized does not work here. It raises an exception in CheckMemoryFormat.h prior to - // empty_affine_quantizd/_empty_per_channel_affine_quantized calls + // empty_affine_quantized/_empty_per_channel_affine_quantized calls // at::empty also does not work here because there is no proper at::empty support for quantized tensors // as it would return a quantized tensor with an UnknownQuantizer auto r = self.is_quantized() ? at::empty_like(self, memory_format) @@ -609,9 +645,7 @@ Tensor sparse_compressed_to_dense( auto compressed_rows = self.layout() == kSparseCsr || self.layout() == kSparseBsr; auto block_sparse = self.layout() == kSparseBsr || self.layout() == kSparseBsc; - Tensor compressed_indices; - Tensor plain_indices; - std::tie(compressed_indices, plain_indices) = + auto [compressed_indices, plain_indices] = sparse_csr::getCompressedPlainIndices(self); auto values = self.values(); @@ -656,7 +690,7 @@ Tensor sparse_compressed_to_dense( dense = dense.reshape(dense_reshaped_sizes); // Calculate batch, row and column indices for non-zeros in the - // sparse matrix, and use these to calculate correspoding indices + // sparse matrix, and use these to calculate corresponding indices // into the dense matrix reshaped as above. Then, update dense // matrix by adding sparse matrix values into elements with indices // calculated this way. 
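Illustration (not part of the patch): the kSparse branch added to _to_copy above converts indices and values separately and rebuilds the COO tensor, so a dtype conversion on a sparse COO tensor keeps its layout and integer indices. A minimal sketch, assuming only the public libtorch C++ API (torch::tensor, Tensor::to_sparse, Tensor::to); the exact dispatch path is not asserted beyond the hunk above.

#include <torch/torch.h>
#include <iostream>

int main() {
  auto dense = torch::tensor({{0.0, 1.0}, {2.0, 0.0}}, torch::kFloat);
  auto coo = dense.to_sparse();               // sparse COO layout, float values
  auto converted = coo.to(torch::kDouble);    // dtype change forces a copy
  // Values become double, indices remain integral, layout stays sparse COO.
  std::cout << converted.layout() << " " << converted.dtype() << std::endl;
  return 0;
}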
@@ -1482,7 +1516,7 @@ void convert_indices_from_coo_to_csr_cpu( const Tensor& input, const int64_t size) { int64_t numel = input.numel(); - const input_t* data_in = input.data_ptr(); + const input_t* data_in = input.const_data_ptr(); output_t* data_out = result.data_ptr(); if (numel == 0) { @@ -1528,7 +1562,7 @@ void convert_indices_from_csr_to_coo_cpu( batch_indices.copy_(at::sparse::full_coo_indices(crow_indices.sizes().slice(0, batch_ndim), crow_indices.options()) .repeat_interleave(nnz, 1)); } - const input_t* crow_indices_data_in = crow_indices_->data_ptr(); + const input_t* crow_indices_data_in = crow_indices_->const_data_ptr(); TORCH_INTERNAL_ASSERT(indices.is_contiguous()); auto row0 = indices.select(0, transpose ? batch_ndim + 1 : batch_ndim + 0); auto row1 = indices.select(0, transpose ? batch_ndim + 0 : batch_ndim + 1); @@ -1836,8 +1870,7 @@ Tensor sparse_compressed_to_sparse(const Tensor& self, const int64_t sparse_dim) _to_sparse_check_arguments("sparse_compressed_to_sparse", self, sparse_dim); Layout layout = self.layout(); - Tensor compressed_indices, plain_indices; - std::tie(compressed_indices, plain_indices) = at::sparse_csr::getCompressedPlainIndices(self); + auto [compressed_indices, plain_indices] = at::sparse_csr::getCompressedPlainIndices(self); Tensor values; Tensor indices = at::_convert_indices_from_csr_to_coo(compressed_indices, plain_indices, false, (layout == kSparseCsc || layout == kSparseBsc)); diff --git a/aten/src/ATen/native/TensorDimApply.h b/aten/src/ATen/native/TensorDimApply.h index 65d90f6fda1f5..4d52446446316 100644 --- a/aten/src/ATen/native/TensorDimApply.h +++ b/aten/src/ATen/native/TensorDimApply.h @@ -10,7 +10,7 @@ void tensor_dim_apply3(const Tensor& self, Tensor& values, Tensor& indices, int6 int ndims = self.dim(); int tensor_dim_apply_has_finished = 0; std::vector counter(ndims, 0); - T1* self_data = self.data_ptr(); + const T1* self_data = self.const_data_ptr(); T1* values_data = values.data_ptr(); T2* indices_data = indices.data_ptr(); int64_t self_stride = self.stride(dim); diff --git a/aten/src/ATen/native/TensorFactories.cpp b/aten/src/ATen/native/TensorFactories.cpp index 2cb92baf79f9b..c8fddc3756353 100644 --- a/aten/src/ATen/native/TensorFactories.cpp +++ b/aten/src/ATen/native/TensorFactories.cpp @@ -35,6 +35,7 @@ #include #include #include +#include #include #include #include @@ -214,8 +215,8 @@ Tensor& complex_out(const Tensor& real, const Tensor& imag, Tensor& result) { complex_check_dtype(result, real, imag); auto iter = TensorIteratorConfig() .add_output(result) - .add_input(real) - .add_input(imag) + .add_const_input(real) + .add_const_input(imag) .check_all_same_dtype(false) .build(); complex_stub(iter.device_type(), iter); @@ -234,8 +235,8 @@ Tensor& polar_out(const Tensor& abs, const Tensor& angle, Tensor& result) { complex_check_dtype(result, abs, angle); auto iter = TensorIteratorConfig() .add_output(result) - .add_input(abs) - .add_input(angle) + .add_const_input(abs) + .add_const_input(angle) .check_all_same_dtype(false) .build(); polar_stub(iter.device_type(), iter); @@ -277,8 +278,8 @@ Tensor empty_names( } TORCH_CHECK(options.layout() == Layout::Strided, "NYI: named tensors only support strided layout"); - TORCH_CHECK(options.device().is_cpu() || options.device().is_cuda() || options.device().is_privateuseone(), - "NYI: named tensors only support CPU, CUDA or ", c10::get_privateuse1_backend(), " tensors."); + TORCH_CHECK(options.device().is_cpu() || options.device().is_cuda() || options.device().is_xpu() || 
options.device().is_privateuseone(), + "NYI: named tensors only support CPU, CUDA, XPU or ", c10::get_privateuse1_backend(), " tensors."); auto result = at::empty(size, options, optional_memory_format); internal_set_names_inplace(result, names); return result; @@ -368,10 +369,9 @@ Tensor& empty_out(IntArrayRef size, // Some scalar types in CAST_OP have no declarations, they may be unused in Pytorch. // But we keep them and ignore the warning here until verified in the future. -#pragma clang diagnostic push -#pragma clang diagnostic ignored "-Wmissing-prototypes" +C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wmissing-prototypes") AT_FORALL_SCALAR_TYPES_AND3(Bool, Half, BFloat16, DEFINE_CAST_OP) -#pragma clang diagnostic pop +C10_DIAGNOSTIC_POP() #undef DEFINE_CAST_OP @@ -1339,16 +1339,16 @@ Tensor _efficientzerotensor(IntArrayRef size, return out; } -Tensor _efficientzerotensor_meta(IntArrayRef size, - c10::optional dtype, - c10::optional layout, - c10::optional device, - c10::optional pin_memory) { +Tensor _efficientzerotensor_meta_symint(SymIntArrayRef size, + c10::optional dtype, + c10::optional layout, + c10::optional device, + c10::optional pin_memory) { auto device_ = device_or_default(device); auto allocator = at::native::ZeroTensorAllocator(device_); auto dtype_ = dtype_or_default(dtype); auto zero_ks = at::DispatchKeySet(c10::DispatchKey::Meta) | at::DispatchKeySet(c10::DispatchKey::ZeroTensor); - auto out = at::detail::empty_generic(size, &allocator, zero_ks, dtype_, c10::nullopt); + auto out = at::detail::empty_generic_symint(size, &allocator, zero_ks, dtype_, c10::nullopt); return out; } @@ -1391,11 +1391,29 @@ Tensor zeros_like( if (self.is_sparse()) { res.sparse_resize_and_clear_( self.sizes(), self.sparse_dim(), self.dense_dim()); + } else if (at::sparse_csr::is_sparse_compressed(self)) { + res.sparse_resize_and_clear_( + self.sizes(), self.sizes().size() - self.dense_dim(), self.dense_dim()); } else { res.sparse_resize_and_clear_(self.sizes(), self.sizes().size(), 0); } res._coalesced_(true); + return res; + } else if (at::sparse_csr::is_sparse_compressed(options.layout())) { + int64_t nnz = 0; + int64_t dense_dim = (self.layout() == kStrided ? 
self.dim() - 2: self.dense_dim()); + DimVector blocksize{}; + if (self.layout() == kSparseBsr || self.layout() == kSparseBsc) { + blocksize.append(at::sparse_csr::getBlockSize(self)); + } + ScalarType index_dtype = at::sparse_csr::getIndexDtype(self); + auto res = at::native::sparse_compressed_tensor_with_dims( + nnz, dense_dim, self.sizes(), blocksize, index_dtype, + typeMetaToScalarType(options.dtype()), options.layout(), options.device(), options.pinned_memory()); + Tensor compressed_indices, plain_indices; + std::tie(compressed_indices, plain_indices) = at::sparse_csr::getCompressedPlainIndices(res); + compressed_indices.zero_(); return res; } auto result = at::empty_like(self, options, optional_memory_format); diff --git a/aten/src/ATen/native/TensorFactories.h b/aten/src/ATen/native/TensorFactories.h index 26b5739791114..f9b2893d768a9 100644 --- a/aten/src/ATen/native/TensorFactories.h +++ b/aten/src/ATen/native/TensorFactories.h @@ -124,12 +124,13 @@ struct ZeroTensorAllocator final : public at::Allocator { static void deleter(void* const pointer) { TORCH_INTERNAL_ASSERT(!pointer); } - DataPtr allocate(const size_t /*nbytes*/) const override { + DataPtr allocate(const size_t /*nbytes*/) override { return {nullptr, nullptr, &deleter, device_}; } DeleterFnPtr raw_deleter() const override { return deleter; } + void copy_data(void* dest, const void* src, std::size_t count) const final {} at::Device device_; }; diff --git a/aten/src/ATen/native/TensorIteratorDynamicCasting.h b/aten/src/ATen/native/TensorIteratorDynamicCasting.h index b042ebae27bfc..a2bdd6eb13e4b 100644 --- a/aten/src/ATen/native/TensorIteratorDynamicCasting.h +++ b/aten/src/ATen/native/TensorIteratorDynamicCasting.h @@ -3,12 +3,11 @@ #include #include #include -#include #include #include -// This file includes utilties for dynamic_casting done by TensorIterator, see CUDALoops.cuh and Loops.h. +// This file includes utilities for dynamic_casting done by TensorIterator, see CUDALoops.cuh and Loops.h. // dynamic_casting handles when the types expected by the iterator do not match the types of the arguments // to the function that is being called. 
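Illustration (not part of the patch): the sparse-compressed branch added to zeros_like in TensorFactories.cpp above is meant to produce an all-zero tensor that keeps the compressed layout (CSR/CSC/BSR/BSC) with no specified values. A minimal sketch, assuming only the public libtorch C++ API (torch::tensor, Tensor::to_sparse_csr, at::zeros_like, Tensor::_nnz):

#include <torch/torch.h>
#include <iostream>

int main() {
  auto dense = torch::tensor({{1.0, 0.0}, {0.0, 2.0}}, torch::kFloat);
  auto csr = dense.to_sparse_csr();   // sparse compressed (CSR) layout
  auto z = at::zeros_like(csr);
  // Expected with the branch above: layout stays SparseCsr and nnz == 0.
  std::cout << z.layout() << " nnz=" << z._nnz() << std::endl;
  return 0;
}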
diff --git a/aten/src/ATen/native/TensorIteratorReduce.cpp b/aten/src/ATen/native/TensorIteratorReduce.cpp index 7164b89554c59..9c4e4e9459d46 100644 --- a/aten/src/ATen/native/TensorIteratorReduce.cpp +++ b/aten/src/ATen/native/TensorIteratorReduce.cpp @@ -59,7 +59,7 @@ static void two_pass_reduction(TensorIteratorBase& iter, loop2d_t loop) { auto shape = first_reduce.shape(); auto strides = first_reduce.get_strides(); - // Bump output ptr so each thread has its own ouput slice + // Bump output ptr so each thread has its own output slice auto base_ptrs = first_reduce.get_base_ptrs(); base_ptrs[0] += buffer_stride * thread_num; diff --git a/aten/src/ATen/native/TensorShape.cpp b/aten/src/ATen/native/TensorShape.cpp index 05bd5f4cafa2a..a99e6e3a50c11 100644 --- a/aten/src/ATen/native/TensorShape.cpp +++ b/aten/src/ATen/native/TensorShape.cpp @@ -36,9 +36,11 @@ #include #include #else +#include #include #include #include +#include #include #include #include @@ -151,6 +153,7 @@ #include #include #include +#include #include #include #include @@ -409,7 +412,7 @@ Tensor& set_storage_meta__symint(Tensor& result, Storage storage, c10::SymInt st result.unsafeGetTensorImpl()->set_sizes_and_strides(size, stride, storage_offset); // Matches maybe_resize_storage_cpu no-numel behavior - if (result.sym_numel() != 0) { + if (TORCH_GUARD_SIZE_OBLIVIOUS(result.sym_numel().sym_ne(0))) { // maybe_resize_storage_cpu can handle no storage exists at all but // that should never be the case here TORCH_INTERNAL_ASSERT(storage); @@ -418,9 +421,19 @@ Tensor& set_storage_meta__symint(Tensor& result, Storage storage, c10::SymInt st // it. TODO: Actually this might not quite be correct if we use special // pointers to track whether or not fake cuda tensors are pinned or not const auto itemsize = result.dtype().itemsize(); - c10::SymInt size_bytes = at::detail::computeStorageNbytes( + c10::SymInt new_size_bytes = at::detail::computeStorageNbytes( size, stride, itemsize, std::move(storage_offset)); - storage.set_nbytes(std::move(size_bytes)); + // TODO: When there are unbacked SymInts, we unconditionally skip the + // setter. This is technically wrong, but we cannot conveniently test + // the real condition in many cases, because a lot of people are using + // set_ just to swizzle metadata on a tensor, they didn't actually want + // to see if they need to resize the storage. + // + // The old behavior was to unconditionally set_nbytes, but I think not + // setting it is more safe. + if (new_size_bytes.has_hint() && storage.sym_nbytes().has_hint() && TORCH_GUARD_SIZE_OBLIVIOUS(new_size_bytes.sym_gt(storage.sym_nbytes()))) { + storage.set_nbytes(std::move(new_size_bytes)); + } } return result; } @@ -508,7 +521,7 @@ Tensor sparse_broadcast_to(const Tensor& self, IntArrayRef size) { } } // to_broadcast conserves is_coalesced property iff only the last - // sparse dimensions are expaned. Possible expansion of dense + // sparse dimensions are expanded. Possible expansion of dense // dimensions can be discarded as it does not affect the is_coalesce // property. 
bool is_coalesced = self.dim()==0 || (self.is_coalesced() && (max_unchanged_dim < min_broadcast_dim || min_broadcast_dim == -1)); @@ -553,7 +566,7 @@ static void fastCatOutDim0(const Tensor& out, const MaterializedITensorListRef& for (const Tensor& input : inputs) { TORCH_CHECK(outBytes >= totalBytes); if (input.nbytes() > 0) { - std::memcpy(dataPtr + totalBytes, input.data_ptr(), input.nbytes()); + std::memcpy(dataPtr + totalBytes, input.const_data_ptr(), input.nbytes()); } totalBytes += input.nbytes(); } @@ -608,7 +621,7 @@ TORCH_IMPL_FUNC(cat_out_cpu) .set_check_mem_overlap(false) .resize_outputs(false) .add_output(result_slice) - .add_input(source_slice) + .add_const_input(source_slice) .enforce_safe_casting_to_output(true) .build(); @@ -616,10 +629,10 @@ TORCH_IMPL_FUNC(cat_out_cpu) if (cat_should_skip_tensor(tensor)) { continue; } - auto source_data = static_cast(tensor.data_ptr()); + auto source_data = static_cast(tensor.const_data_ptr()); auto result_data = static_cast(result_slice_data) + offset * result_stride_bytes; iter.unsafe_replace_operand(0, result_data); - iter.unsafe_replace_operand(1, source_data); + iter.unsafe_replace_operand(1, const_cast(source_data)); copy_stub(iter.device_type(), iter, false); offset += slice_dim_size; } @@ -635,7 +648,7 @@ TORCH_IMPL_FUNC(cat_out_cpu) .set_check_mem_overlap(false) // Already checked above .resize_outputs(false) .add_output(result_slice) - .add_input(tensor) + .add_const_input(tensor) .promote_inputs_to_common_dtype(true) .cast_common_dtype_to_outputs(true) .enforce_safe_casting_to_output(true) @@ -1003,7 +1016,7 @@ std::vector tensor_split(const Tensor& self, const Tensor& tensor_indice int64_t sections = tensor_indices_or_sections.item(); return self.tensor_split(sections, dim); } else { - auto indices_data = tensor_indices_or_sections.data_ptr(); + auto indices_data = tensor_indices_or_sections.const_data_ptr(); auto stride = tensor_indices_or_sections.stride(0); auto numel = tensor_indices_or_sections.numel(); std::vector indices(numel); @@ -1343,22 +1356,22 @@ Tensor& narrow_copy_dense_cpu_out( return output; } - char* src_bytes = static_cast(self_contig->data_ptr()); + const char* src_bytes = static_cast(self_contig->const_data_ptr()); char* dst_bytes = static_cast(output.data_ptr()); size_t src_block_size_bytes = itemsize * src_block_size; size_t dst_block_size_bytes = itemsize * dst_block_size; size_t src_offset = unit * start; - char* src_offset_bytes = src_bytes + itemsize * src_offset; + const char* src_offset_bytes = src_bytes + itemsize * src_offset; char* dst_offset_bytes = dst_bytes; for (const auto i : c10::irange(num_blocks)) { - char* local_src_offset_bytes = src_offset_bytes + i * src_block_size_bytes; + const char* local_src_offset_bytes = src_offset_bytes + i * src_block_size_bytes; char* local_dst_offset_bytes = dst_offset_bytes + i * dst_block_size_bytes; TORCH_INTERNAL_ASSERT_DEBUG_ONLY( - static_cast(local_src_offset_bytes + dst_block_size_bytes) <= - static_cast(src_bytes + src_nbytes)); + static_cast(local_src_offset_bytes + dst_block_size_bytes) <= + static_cast(src_bytes + src_nbytes)); TORCH_INTERNAL_ASSERT_DEBUG_ONLY( static_cast(local_dst_offset_bytes + dst_block_size_bytes) <= static_cast(dst_bytes + dst_nbytes)); @@ -1443,16 +1456,12 @@ static _permute_size_stride_estimation(const Tensor& self, IntArrayRef dims) { } Tensor permute(const Tensor& self, IntArrayRef dims) { - DimVector new_sizes, new_strides; - std::vector _; - std::tie(new_sizes, new_strides, _) = _permute_size_stride_estimation(self, 
dims); + auto [new_sizes, new_strides, _] = _permute_size_stride_estimation(self, dims); return self.as_strided(new_sizes, new_strides); } Tensor permute_sparse_coo(const Tensor& self, IntArrayRef dims) { - DimVector new_sizes, _; - std::vector wrapped_dims; - std::tie(new_sizes, _, wrapped_dims) = _permute_size_stride_estimation(self, dims); + auto [new_sizes, _, wrapped_dims] = _permute_size_stride_estimation(self, dims); const auto ndim = self.dim(); const auto sparse_ndim = self.sparse_dim(); @@ -1826,7 +1835,7 @@ Tensor select_symint(const Tensor& self, int64_t dim, c10::SymInt index) { auto size = self.sym_sizes()[dim]; // Note: `size < -index` is not equivalent to `size <= -1 - index` if index is INT64_MIN // For std::numeric_limits::min() result of unary minus is undefined by the standard - // but in practice is equal to self. On the other hand, indexing wraping is valid for all + // but in practice is equal to self. On the other hand, indexing wrapping is valid for all // negative int64_t values, as x[INT64_MIN] is the same as x[INT64_MAX] if (size <= -1 - index || size <= index) { if (self.has_names() && self.names()[dim] != Dimname::wildcard()) { @@ -1877,7 +1886,7 @@ Tensor select_backward_symint(const Tensor& grad, c10::SymIntArrayRef input_size Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& index) { /* Algorithm: - index - a 1-D tensor of indicies with shape (n,) + index - a 1-D tensor of indices with shape (n,) self - sparse tensor, its shape is sizes = sparse_shape + dense_shape indices - 2-D tensor of indices, shape is (sparse_dims, nnz) values - (1+len(dense_shape))-D tensor of values, shape is (nnz,) + dense_shape @@ -2022,15 +2031,13 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in return std::make_tuple(dim_indices, at::arange(nnz, dim_indices.options()), nneg_index); } else { - Tensor sorted_dim_indices, sorted_dim_indices_idx; - std::tie(sorted_dim_indices, sorted_dim_indices_idx) = dim_indices.sort(); + auto [sorted_dim_indices, sorted_dim_indices_idx] = dim_indices.sort(); return std::make_tuple(sorted_dim_indices, sorted_dim_indices_idx, nneg_index); } } // sort nneg_index to binary search into it else { - Tensor sorted_nneg_index, sorted_nneg_index_idx; - std::tie(sorted_nneg_index, sorted_nneg_index_idx) = nneg_index.sort(); + auto [sorted_nneg_index, sorted_nneg_index_idx] = nneg_index.sort(); return std::make_tuple(sorted_nneg_index, sorted_nneg_index_idx, dim_indices); } }(); @@ -2061,7 +2068,7 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in // fill in src_int_idx, sorted_int_idx, int_counts { const auto sorted_len = sorted.numel(); - const auto* ptr_sorted = sorted.data_ptr(); + const auto* ptr_sorted = sorted.const_data_ptr(); const auto* ptr_sorted_start = ptr_sorted; const auto* ptr_sorted_end = ptr_sorted + sorted_len; @@ -2071,7 +2078,7 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in auto* ptr_tid_src_int_idx = src_int_idx.select(0, tid).data_ptr(); auto* ptr_tid_sorted_int_idx = sorted_int_idx.select(0, tid).data_ptr(); auto* ptr_tid_int_counts = int_counts.select(0, tid).data_ptr(); - const auto* ptr_src = src.data_ptr() + start; + const auto* ptr_src = src.const_data_ptr() + start; for (const auto i : c10::irange(start, end)) { const auto src_val = *ptr_src++; @@ -2124,14 +2131,14 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in auto* ptr_selected_src = selected_src.data_ptr(); 
const auto thread_offsets = compressed_int_counts.cumsum(0).sub_(compressed_int_counts); - const auto* ptr_sorted_idx = sorted_idx.data_ptr(); + const auto* ptr_sorted_idx = sorted_idx.const_data_ptr(); at::parallel_for(0, n_threads_src, 1, [&](int64_t tid, C10_UNUSED int64_t _) { const auto start = tid * chunk_size_src; const auto end = std::min(start + chunk_size_src, src_len); const auto tid_offset = thread_offsets.const_data_ptr()[tid]; - const auto* ptr_tid_src_int_idx = src_int_idx.select(0, tid).data_ptr(); - const auto* ptr_tid_sorted_int_idx = sorted_int_idx.select(0, tid).data_ptr(); - const auto* ptr_tid_int_counts = int_counts.select(0, tid).data_ptr(); + const auto* ptr_tid_src_int_idx = src_int_idx.select(0, tid).const_data_ptr(); + const auto* ptr_tid_sorted_int_idx = sorted_int_idx.select(0, tid).const_data_ptr(); + const auto* ptr_tid_int_counts = int_counts.select(0, tid).const_data_ptr(); auto* ptr_tid_selected_sorted = ptr_selected_sorted + tid_offset; auto* ptr_tid_selected_src = ptr_selected_src + tid_offset; @@ -2166,7 +2173,7 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in bool run_in_parallel = true) -> Tensor { auto cidx = at::empty({len + 1}, idx.options()); - const auto* ptr_idx = idx.data_ptr(); + const auto* ptr_idx = idx.const_data_ptr(); auto* ptr_cidx = cidx.data_ptr(); const auto idx_len = idx.numel(); @@ -2205,7 +2212,7 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in } else { auto* ptr_counts = counts.data_ptr(); - const auto* ptr_vals = t.data_ptr(); + const auto* ptr_vals = t.const_data_ptr(); for (C10_UNUSED const auto _ : c10::irange(t.numel())) { ++ptr_counts[*ptr_vals++]; } @@ -2313,10 +2320,10 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in const auto src_idx_len = src_intersection_offsets.const_data_ptr()[size - 1]; auto src_idx = at::empty({src_idx_len}, src.options()); - const auto* ptr_src = src.data_ptr(); - const auto* ptr_intersection_counts = intersection_counts.data_ptr(); - const auto* ptr_src_intersection_counts = src_intersection_counts.data_ptr(); - const auto* ptr_src_intersection_offsets = src_intersection_offsets.data_ptr(); + const auto* ptr_src = src.const_data_ptr(); + const auto* ptr_intersection_counts = intersection_counts.const_data_ptr(); + const auto* ptr_src_intersection_counts = src_intersection_counts.const_data_ptr(); + const auto* ptr_src_intersection_offsets = src_intersection_offsets.const_data_ptr(); auto* ptr_src_idx = src_idx.data_ptr(); const auto src_len = src.numel(); @@ -2329,9 +2336,9 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in const auto end = std::min(start + chunk_size, src_len); auto* ptr_src_tid = ptr_src + start; const auto* ptr_src_counts_per_thread - = src_counts_per_thread.select(0, tid).data_ptr(); + = src_counts_per_thread.select(0, tid).const_data_ptr(); const auto* ptr_src_offset_counts_per_thread - = src_offset_counts_per_thread.select(0, tid).data_ptr(); + = src_offset_counts_per_thread.select(0, tid).const_data_ptr(); auto tid_counts = at::zeros({size}, src.options()); auto* ptr_tid_counts = tid_counts.data_ptr(); @@ -2356,8 +2363,7 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in return std::make_tuple(src_idx, src_idx_offsets); }(); - Tensor idx_selected, src_selected; - std::tie(idx_selected, src_selected) = [&]( + auto [idx_selected, src_selected] = [&]( int64_t grain_size = at::internal::GRAIN_SIZE ) -> 
std::tuple { const auto thread_offset = [&]() { @@ -2366,16 +2372,16 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in auto counts_per_thread = idx_counts_per_thread.mul_(src_counts).sum(-1); return counts_per_thread.cumsum(0).sub_(counts_per_thread); }(); - const auto* ptr_thread_offset = thread_offset.data_ptr(); + const auto* ptr_thread_offset = thread_offset.const_data_ptr(); auto idx_selected = at::empty({res_len}, idx.options()); auto src_selected = at::empty({res_len}, src.options()); - const auto* ptr_idx = idx.data_ptr(); - const auto* ptr_src_counts = src_counts.data_ptr(); - const auto* ptr_intersection_counts = intersection_counts.data_ptr(); - const auto* ptr_src_idx = src_idx.data_ptr(); - const auto* ptr_src_idx_offsets = src_idx_offsets.data_ptr(); + const auto* ptr_idx = idx.const_data_ptr(); + const auto* ptr_src_counts = src_counts.const_data_ptr(); + const auto* ptr_intersection_counts = intersection_counts.const_data_ptr(); + const auto* ptr_src_idx = src_idx.const_data_ptr(); + const auto* ptr_src_idx_offsets = src_idx_offsets.const_data_ptr(); auto* ptr_idx_selected = idx_selected.data_ptr(); auto* ptr_src_selected = src_selected.data_ptr(); @@ -2428,8 +2434,7 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in const auto get_result_small_nnz_small_index = [&]() -> Tensor { const auto dim_indices_in_inner_loop = nnz >= index_len; - Tensor outer, inner; - std::tie(outer, inner) = [&]() -> std::tuple { + auto [outer, inner] = [&]() -> std::tuple { if (dim_indices_in_inner_loop) { return std::make_tuple(nneg_index, dim_indices); } @@ -2438,8 +2443,8 @@ Tensor index_select_sparse_cpu(const Tensor& self, int64_t dim, const Tensor& in } }(); - const auto* ptr_outer = outer.data_ptr(); - const auto* ptr_inner = inner.data_ptr(); + const auto* ptr_outer = outer.const_data_ptr(); + const auto* ptr_inner = inner.const_data_ptr(); // NOTE: if very critical, replace std::vector with // a data structure that operates on stack up to some limit. auto outer_selected_idx = std::vector(); @@ -2559,6 +2564,17 @@ Tensor slice( return result; } +Tensor slice_inverse_symint( + const Tensor& self, + const Tensor& base, + int64_t /* dim */, + c10::optional /* start */, + c10::optional /* end */, + SymInt /* step */) { + // assume self has enough to storage to be viewed with base's metadata + return self.as_strided_symint(base.sym_sizes(), base.sym_strides(), base.sym_storage_offset()); +} + Tensor slice_backward(const Tensor& grad, IntArrayRef input_sizes, int64_t dim, int64_t start, int64_t end, int64_t step) { auto grad_input = at::zeros(input_sizes, grad.options()); grad_input.slice(dim, start, end, step).copy_(grad); @@ -2719,6 +2735,38 @@ static void check_stack_inputs(TensorList tensors, int64_t dim) { } } +// Pads each tensor on `dim`-th dimension such that padded_dim % num_chunks == 0. 
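+// For example, with dim = 0 and num_chunks = 3, a 1-D tensor holding 7 elements is zero-padded to 9 elements and viewed as a {3, 3} tensor; _chunk_cat below then concatenates these per-input views along dimension dim + 1.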
+static std::vector<Tensor> _pad_chunk(TensorList tensors, int64_t dim, int64_t num_chunks) { + auto num_tensors = tensors.size(); + std::vector<Tensor> padded_tensors; + padded_tensors.reserve(num_tensors); + for (const auto & tensor : tensors) { + auto tensor_size = tensor.sizes(); + std::vector<int64_t> padded_size(tensor_size.vec()); + padded_size[dim] = (tensor_size[dim] + num_chunks - 1) / num_chunks * num_chunks; + Tensor padded_tensor = tensor; + if (padded_size != tensor_size) { + padded_tensor = tensor.new_zeros(padded_size); + padded_tensor.narrow(dim, 0, tensor_size[dim]).copy_(tensor); + } + std::vector<int64_t> view_sizes(tensor_size.begin(), tensor_size.begin()+dim); + view_sizes.insert(view_sizes.end(), {num_chunks, -1}); + padded_tensors.push_back(padded_tensor.view(view_sizes)); + } + return padded_tensors; +} + +Tensor _chunk_cat(TensorList tensors, int64_t dim, int64_t num_chunks) { + auto wrapped_dim = at::native::preprocess_chunk_cat_inputs(tensors, dim, num_chunks); + return at::cat(_pad_chunk(tensors, wrapped_dim, num_chunks), wrapped_dim+1); +} + +Tensor& _chunk_cat_out(TensorList tensors, int64_t dim, int64_t num_chunks, Tensor& out) { + auto wrapped_dim = at::native::preprocess_chunk_cat_inputs(tensors, dim, num_chunks); + at::cat_out(out, _pad_chunk(tensors, wrapped_dim, num_chunks), wrapped_dim+1); + return out; +} + // TODO(msubkhankulov): refactor to use _stack Tensor stack(TensorList tensors, int64_t dim) { TORCH_CHECK(!tensors.empty(), @@ -2928,11 +2976,11 @@ Tensor & transpose_(Tensor & self, int64_t dim0, int64_t dim1) { } // Sparse COO is an exceptional sparse format as it allows transpose - // to be a view operation which is a convinient property for + // to be a view operation which is a convenient property for // in-place operations. For other sparse formats, the in-place // transpose would not be possible without shuffling the specified // values. So we don't support this as it would defeat the purpose - // of in-place opeations of being memory-efficient. + // of in-place operations of being memory-efficient. if (self.is_sparse()) { return sparse_transpose_(self, dim0, dim1); } @@ -3199,13 +3247,11 @@ inferUnsqueezeGeometry(const Tensor& tensor, int64_t dim) { // dim is present if squeezing a single dimension and absent if squeezing all dimensions Tensor squeeze_qtensor(const Tensor& self, c10::OptionalIntArrayRef dims) { auto quantizer = get_qtensorimpl(self)->quantizer(); - SymDimVector sizes; - SymDimVector strides; const auto ndim = self.dim(); auto mask = dims.has_value() ?
dim_list_to_bitset(dims, self.dim()) : std::bitset((1ull << self.dim()) - 1); - std::tie(sizes, strides) = inferSqueezeGeometry(self, mask); + auto [sizes, strides] = inferSqueezeGeometry(self, mask); if (quantizer->qscheme() == QScheme::PER_CHANNEL_AFFINE) { const auto* per_channel_quantizer = static_cast(quantizer.get()); auto axis = per_channel_quantizer->axis(); @@ -3417,6 +3463,10 @@ Tensor flatten(const Tensor& self, int64_t start_dim, int64_t end_dim) { } Tensor flatten(const Tensor& self, int64_t start_dim, int64_t end_dim, Dimname out_dim) { + start_dim = maybe_wrap_dim(start_dim, self.dim()); + end_dim = maybe_wrap_dim(end_dim, self.dim()); + TORCH_CHECK(start_dim <= end_dim, "flatten() has invalid args: start_dim cannot come after end_dim"); + auto outnames = self.names().vec(); outnames.erase(outnames.begin() + start_dim, outnames.begin() + end_dim + 1); outnames.insert(outnames.begin() + start_dim, out_dim); @@ -4012,6 +4062,13 @@ void split_with_sizes_copy_out(const at::Tensor & self, at::IntArrayRef split_si TORCH_CHECK(out.size() == tmp.size(), "split_with_sizes_copy_out() expected an out= argument of size ", tmp.size(), ", got size ", out.size()); for (const auto i : c10::irange(out.size())) { + if (resize_output_check(out[i], tmp[i].sizes())) { + out[i].resize_(tmp[i].sizes()); + } + TORCH_CHECK(out[i].dtype() == tmp[i].dtype(), + "Expected out tensor to have dtype ", tmp[i].dtype(), ", but got ", out[i].dtype(), " instead"); + TORCH_CHECK(out[i].device() == tmp[i].device(), + "Expected out tensor to have device ", tmp[i].device(), ", but got ", out[i].device(), " instead"); out[i].copy_(tmp[i]); } } @@ -4025,11 +4082,13 @@ void unbind_copy_int_out(const at::Tensor & self, int64_t dim, at::TensorList o } } -int64_t sparse_dim_strided(const at::Tensor& self) { +int64_t sparse_dim_default(const Tensor& self) { + TORCH_CHECK(self.layout() == kStrided, "sparse_dim expected sparse or strided tensor layout but got ", self.layout()); return 0; } -int64_t dense_dim_strided(const at::Tensor& self) { +int64_t dense_dim_default(const Tensor& self) { + TORCH_CHECK(self.layout() == kStrided, "dense_dim expected sparse or strided tensor layout but got ", self.layout()); return self.dim(); } diff --git a/aten/src/ATen/native/TensorShape.h b/aten/src/ATen/native/TensorShape.h index 1c84abb822aba..c35023d076e73 100644 --- a/aten/src/ATen/native/TensorShape.h +++ b/aten/src/ATen/native/TensorShape.h @@ -8,7 +8,7 @@ namespace at::native { TORCH_API at::Tensor clone_preserve_strides(const at::Tensor& self); inline bool cat_should_skip_tensor(const Tensor& t) { - return t.numel() == 0 && t.dim() == 1; + return t.sym_numel() == 0 && t.dim() == 1; } // Check to see if the shape of tensors is compatible @@ -55,4 +55,51 @@ inline int64_t get_num_splits(const Tensor& self, int64_t split_size, int64_t di return num_splits; } +inline bool have_same_ndims(TensorList tensors) { + auto ndim = tensors[0].dim(); + for (const auto tensor_idx : c10::irange(tensors.size())) { + if(tensors[tensor_idx].dim() != ndim) { + return false; + } + } + return true; +} + +inline void leading_dimension_matches(TensorList tensors, int64_t dim) { + auto tensor_zero_size = tensors[0].sizes(); + std::vector leading_dim_sizes(tensor_zero_size.begin(), tensor_zero_size.begin() + dim); + for (const auto i : c10::irange(tensors.size())) { + at::Tensor tensor = tensors[i]; + for(const auto j : c10::irange(dim)) { + TORCH_CHECK( + tensor.size(j) == leading_dim_sizes[j], + "_chunk_cat expects same sizes of 0,...,dim-1 dimensions 
for all tensors" + ); + } + } +} + +inline int64_t preprocess_chunk_cat_inputs(TensorList tensors, int64_t dim, int64_t num_chunks) { + TORCH_CHECK(num_chunks >= 1, "_chunk_cat expects positive num_chunks"); + TORCH_CHECK(!tensors.empty(), + "_chunk_cat expects a non-empty input tensor list"); + auto expected_dtype = tensors[0].dtype(); + auto expected_device = tensors[0].device(); + for(const auto i : c10::irange(tensors.size())) { + TORCH_CHECK(tensors[i].numel() > 0, "_chunk_cat expects non-empty tensor"); + TORCH_CHECK(tensors[i].dtype() == expected_dtype, "_chunk_cat expects all input tensors with the same dtype"); + TORCH_CHECK(tensors[i].device() == expected_device, "_chunk_cat expects all input tensors on the same device"); + } + if (have_same_ndims(tensors)) { + dim = maybe_wrap_dim(dim, tensors[0].dim()); + } else { + TORCH_CHECK(dim >= 0, "_chunk_cat expects non-negative dim when input tensors have different ndims") + for(const auto i : c10::irange(tensors.size())) { + TORCH_CHECK(dim < tensors[i].ndimension(), "_chunk_cat expects dim < ndim for all input tensors"); + } + } + leading_dimension_matches(tensors, dim); + return dim; +} + } // namespace at::native diff --git a/aten/src/ATen/native/TensorTransformations.cpp b/aten/src/ATen/native/TensorTransformations.cpp index c2f54c5b66574..5a7c3a6de965f 100644 --- a/aten/src/ATen/native/TensorTransformations.cpp +++ b/aten/src/ATen/native/TensorTransformations.cpp @@ -63,8 +63,8 @@ Tensor flip(const Tensor& self, IntArrayRef dims) { .check_all_same_dtype(false) .declare_static_dtype_and_device(self.scalar_type(), self.device()) .add_output(out_tensor) - .add_input(self) - .add_input(restrided_self) + .add_const_input(self) + .add_const_input(restrided_self) .build(); auto* data = reinterpret_cast(iter.data_ptr(0)); diff --git a/aten/src/ATen/native/TestOps.cpp b/aten/src/ATen/native/TestOps.cpp index 6dd2d1aa55517..e2fce123035ba 100644 --- a/aten/src/ATen/native/TestOps.cpp +++ b/aten/src/ATen/native/TestOps.cpp @@ -4,6 +4,7 @@ #include #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -13,6 +14,7 @@ #include #include #include +#include #include #include #include @@ -111,6 +113,21 @@ Tensor _test_check_tensor(const Tensor& self) { return self.clone(); } +Tensor _test_parallel_materialize(const Tensor& self, int64_t num_parallel, bool skip_first) { + at::parallel_for(0, num_parallel, 1, [&](int64_t begin, int64_t end){ + // NOTE: skip_first is meant to avoid triggering the materialization from + // the first thread, to ensure that the subthreads throw the error + // correctly. On some platforms, the first thread is the main thread and it + // begins executing the loop function much earlier than the subthreads.
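// (Presumably, with a grain size of 1, at::parallel_for hands each worker a
// [begin, end) chunk of [0, num_parallel); when the pool has at least
// num_parallel threads every chunk is a single index, so the main thread's
// chunk is exactly [0, 1), which is the chunk the begin == 0 && end == 1 test
// below skips.)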
+ if (skip_first && begin == 0 && end == 1) { + return; + } else { + self.mutable_data_ptr(); + } + }); + return self; +} + } // namespace at::native namespace at::functionalization { diff --git a/aten/src/ATen/native/TopKImpl.h b/aten/src/ATen/native/TopKImpl.h index a9790e892c642..0a11f5f408753 100644 --- a/aten/src/ATen/native/TopKImpl.h +++ b/aten/src/ATen/native/TopKImpl.h @@ -36,14 +36,14 @@ void topk_impl_loop( TensorAccessor mode_indices( reinterpret_cast(data[1] + i * strides[1]), &k, &mode_indices_stride); - TensorAccessor tmp_values( + TensorAccessor tmp_values( reinterpret_cast(data[2] + i * strides[2]), &dim_size, &tmp_values_stride); - auto n = dim_size; - auto use_partial_sort = k * 64 <= n; + auto n_2 = dim_size; + auto use_partial_sort = k * 64 <= n_2; - for (const auto j : c10::irange(n)) { + for (const auto j : c10::irange(n_2)) { queue[j].first = tmp_values[j]; queue[j].second = j; } diff --git a/aten/src/ATen/native/TriangularOps.cpp b/aten/src/ATen/native/TriangularOps.cpp index 66147b441fd74..9cb75a0eccf4e 100644 --- a/aten/src/ATen/native/TriangularOps.cpp +++ b/aten/src/ATen/native/TriangularOps.cpp @@ -41,7 +41,7 @@ namespace { template void apply_triu_tril_single( scalar_t* result, - scalar_t* self, + const scalar_t* self, bool inplace, int64_t k, int64_t n, @@ -86,7 +86,7 @@ template void apply_triu_tril(const Tensor& result, const Tensor& self, bool inplace, int64_t k, bool upper) { auto n = self.size(-2); auto m = self.size(-1); - auto self_data = self.data_ptr(); + auto self_data = self.const_data_ptr(); auto self_stride = (self.dim() > 2 && self.stride(-3) > 0) ? self.stride(-3) : 1; auto batchsize = batchCountTrilTriu(result); auto self_row_stride = self.stride(-2); @@ -107,7 +107,7 @@ void apply_triu_tril(const Tensor& result, const Tensor& self, bool inplace, int parallel_for(0, batchsize, 0, [&](int64_t start, int64_t end) { for (const auto b : c10::irange(start, end)) { - scalar_t* self_batch = &self_data[b * self_stride]; + const scalar_t* self_batch = &self_data[b * self_stride]; scalar_t* result_batch = &result_data[b * result_stride]; apply_triu_tril_single( result_batch, diff --git a/aten/src/ATen/native/UnaryOps.cpp b/aten/src/ATen/native/UnaryOps.cpp index c978ffcc2d89a..6c22d2583f130 100644 --- a/aten/src/ATen/native/UnaryOps.cpp +++ b/aten/src/ATen/native/UnaryOps.cpp @@ -412,7 +412,6 @@ template static inline Tensor& unary_op_impl_float_out(Tensor& result, const Tensor& self, Stub& stub, Args... 
args) { auto iter = TensorIterator::unary_float_op(result, self); stub(iter.device_type(), iter, args...); - iter.cast_outputs(); return result; } @@ -868,7 +867,7 @@ Tensor& logical_not_out(const Tensor& self, Tensor& result) { TensorIterator iter = TensorIteratorConfig() .check_all_same_dtype(false) .add_output(result) - .add_input(self) + .add_const_input(self) .build(); logical_not_stub(iter.device_type(), iter); return result; @@ -964,7 +963,7 @@ std::tuple frexp_out(const Tensor& self, auto iter = TensorIteratorConfig() .add_output(mantissa) .add_output(exponent) - .add_input(self) + .add_const_input(self) .check_all_same_dtype(false) .set_check_mem_overlap(true) .build(); @@ -973,7 +972,7 @@ std::tuple frexp_out(const Tensor& self, return std::tuple(mantissa, exponent); } -// alias for lgamma, implements special.gammanln equivalent to +// alias for lgamma, implements special.gammaln equivalent to // scipy.special.gammaln Tensor special_gammaln(const Tensor& self) { return self.lgamma(); } Tensor& special_gammaln_out(const Tensor& self, Tensor& result) { return at::lgamma_out(result, self); } diff --git a/aten/src/ATen/native/Unfold2d.h b/aten/src/ATen/native/Unfold2d.h index 98d628f7bf2ca..e5fe7d4468217 100644 --- a/aten/src/ATen/native/Unfold2d.h +++ b/aten/src/ATen/native/Unfold2d.h @@ -6,7 +6,25 @@ namespace at::native { -using unfold2d_fn = void (*)( +using unfold2d_copy_fn = void (*)( + ScalarType dtype, + void *finput, + const void *input, + int64_t kH, + int64_t kW, + int64_t dH, + int64_t dW, + int64_t padH, + int64_t padW, + int64_t n_input_plane, + int64_t input_height, + int64_t input_width, + int64_t output_height, + int64_t output_width, + bool is_channels_last +); + +using unfold2d_acc_fn = void (*)( ScalarType dtype, void *finput, void *input, @@ -24,7 +42,7 @@ using unfold2d_fn = void (*)( bool is_channels_last ); -DECLARE_DISPATCH(unfold2d_fn, unfolded2d_copy_stub); -DECLARE_DISPATCH(unfold2d_fn, unfolded2d_acc_stub); +DECLARE_DISPATCH(unfold2d_copy_fn, unfolded2d_copy_stub); +DECLARE_DISPATCH(unfold2d_acc_fn, unfolded2d_acc_stub); } // namespace at::native diff --git a/aten/src/ATen/native/UnfoldBackward.h b/aten/src/ATen/native/UnfoldBackward.h index 7ff39f84c6fdd..44e05c125913e 100644 --- a/aten/src/ATen/native/UnfoldBackward.h +++ b/aten/src/ATen/native/UnfoldBackward.h @@ -100,8 +100,8 @@ static C10_UNUSED TensorIterator _make_unfold_backward_iter_over_grad_out( .check_all_same_dtype(false) .resize_outputs(false) .add_owned_output(grad_out_restrided) - .add_owned_input(grad_in_restrided) - .add_owned_input(idx_dim_restrided) + .add_owned_const_input(grad_in_restrided) + .add_owned_const_input(idx_dim_restrided) .build(); return iter; diff --git a/aten/src/ATen/native/Unique.cpp b/aten/src/ATen/native/Unique.cpp index be220fc40ec7e..801af5d5e79fe 100644 --- a/aten/src/ATen/native/Unique.cpp +++ b/aten/src/ATen/native/Unique.cpp @@ -2,7 +2,7 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include #include #include #include @@ -37,7 +37,7 @@ std::tuple unique_cpu_bool_template( const bool return_inverse, const bool return_counts) { const Tensor& input = self.contiguous(); - bool* input_data = input.data_ptr(); + const bool* input_data = input.const_data_ptr(); int64_t numel = input.numel(); Tensor output = at::empty({0}, self.options()); @@ -270,7 +270,7 @@ std::tuple unique_consecutive_cpu_template( const bool return_inverse, const bool return_counts) { const Tensor& input = self.contiguous(); - const scalar_t* input_data = input.data_ptr(); + 
const scalar_t* input_data = input.const_data_ptr(); int64_t numel = input.numel(); Tensor output = at::empty({numel}, input.options()); Tensor inverse_indices = at::empty({0}, self.options().dtype(kLong)); @@ -390,7 +390,7 @@ std::tuple _unique_dim_cpu_template( std::vector indices(input_flat.size(0)); std::iota(indices.begin(), indices.end(), 0); int64_t numel = input_flat.size(1); - scalar_t* input_flat_ptr = ((scalar_t*)input_flat.data_ptr()); + const scalar_t* input_flat_ptr = ((const scalar_t*)input_flat.const_data_ptr()); // sort indices using data if (!consecutive) { @@ -442,19 +442,17 @@ std::tuple _unique_dim_cpu_template( std::tuple _unique_cpu(const Tensor& self, const bool sorted, const bool return_inverse) { if (self.scalar_type() == kBool) { - Tensor output, inverse; - std::tie(output, inverse, std::ignore) = unique_cpu_bool_template( + auto [output, inverse, _] = unique_cpu_bool_template( self, return_inverse, /* return_counts */false); return std::make_tuple(output, inverse); } - return AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, self.scalar_type(), "unique", [&] { - Tensor output, inverse; + return AT_DISPATCH_V2(self.scalar_type(), "unique", [&] AT_WRAP({ // The current CPU implementation of unique always sort due to // this is faster than hash table - std::tie(output, inverse, std::ignore) = unique_cpu_sorted_template( + auto [output, inverse, _] = unique_cpu_sorted_template( self, return_inverse, /* return_counts */false, IsUnique()); return std::make_tuple(output, inverse); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } std::tuple @@ -462,35 +460,35 @@ _unique2_cpu(const Tensor& self, const bool sorted, const bool return_inverse, c if (self.scalar_type() == kBool) { return unique_cpu_bool_template(self, return_inverse, return_counts); } - return AT_DISPATCH_ALL_TYPES_AND2(kBFloat16, kHalf, self.scalar_type(), "unique", [&] { + return AT_DISPATCH_V2(self.scalar_type(), "unique", AT_WRAP([&] { // The current CPU implementation of unique always sort due to // this is faster than hash table return unique_cpu_sorted_template( self, return_inverse, return_counts, IsUnique()); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } std::tuple unique_dim_cpu(const Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse, const bool return_counts) { - return AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kBool, kHalf, self.scalar_type(), "unique_dim", [&] { + return AT_DISPATCH_V2(self.scalar_type(), "unique_dim", AT_WRAP([&] { // The current implementation using `dim` always sorts due to unhashable tensors return _unique_dim_cpu_template(self, dim, false, return_inverse, return_counts); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } std::tuple unique_dim_consecutive_cpu(const Tensor& self, const int64_t dim, const bool return_inverse, const bool return_counts) { - return AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kBool, kHalf, self.scalar_type(), "unique_dim", [&] { + return AT_DISPATCH_V2(self.scalar_type(), "unique_dim", AT_WRAP([&] { return _unique_dim_cpu_template(self, dim, true, return_inverse, return_counts); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } std::tuple unique_consecutive_cpu(const Tensor& self, const bool return_inverse, const bool return_counts, c10::optional dim) { if (!dim.has_value() || (dim.value() == 0 && self.dim() == 1)) { - return 
AT_DISPATCH_ALL_TYPES_AND3(kBFloat16, kBool, kHalf, self.scalar_type(), "unique", [&] { + return AT_DISPATCH_V2(self.scalar_type(), "unique", AT_WRAP([&] { return unique_consecutive_cpu_template(self, return_inverse, return_counts); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBFloat16, kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } return unique_dim_consecutive_cpu(self, dim.value(), return_inverse, return_counts); } diff --git a/aten/src/ATen/native/UpSample.h b/aten/src/ATen/native/UpSample.h index 95797cb538284..8dadc7cee3ae4 100644 --- a/aten/src/ATen/native/UpSample.h +++ b/aten/src/ATen/native/UpSample.h @@ -4,9 +4,12 @@ #include #include +#include #include +#include #include #include +#include /** * Note [compute_scales_value] @@ -467,30 +470,32 @@ static inline void compute_source_index_and_lambda( } } -// It will not be used by data types other than BFloat16. -template +// It will not be used by data types other than BFloat16 and Half. +template || !std::is_same::value, int> = 0> void inline apply_grad_input(scalar_in* buffer_ptr, scalar_out* gin, int64_t size) { - TORCH_CHECK((std::is_same::value), - "Upsample backward only support BFloat16 in the lower percision data types on CPU.") + TORCH_CHECK((is_reduced_floating_point_v), + "Upsample backward only support BFloat16 and Half in the lower precision data types on CPU.") TORCH_CHECK((std::is_same::value), - "Upsample backward should use float as acc buffer for BFloat16 grad input on CPU.") + "Upsample backward should use float as acc buffer for BFloat16 and Half grad input on CPU.") return; } -template <> -void inline apply_grad_input(float* buffer_ptr, BFloat16* gin, int64_t size) { - using bVec = vec::Vectorized; - using fVec = vec::Vectorized; +template && std::is_same::value, int> = 0> +void inline apply_grad_input(scalar_in* buffer_ptr, scalar_out* gin, int64_t size) { + using bVec = Vectorized; + using fVec = Vectorized; int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec gin_bvec = bVec::loadu(gin + d); fVec gin_fvec0, gin_fvec1; - std::tie(gin_fvec0, gin_fvec1) = convert_bfloat16_float(gin_bvec); + std::tie(gin_fvec0, gin_fvec1) = convert_to_float(gin_bvec); gin_fvec0 += fVec::loadu(buffer_ptr + d); gin_fvec1 += fVec::loadu(buffer_ptr + d + fVec::size()); fVec(0).store(buffer_ptr + d); fVec(0).store(buffer_ptr + d + fVec::size()); - convert_float_bfloat16(gin_fvec0, gin_fvec1).store(gin + d); + convert_from_float(gin_fvec0, gin_fvec1).store(gin + d); } for (; d < size; d++) { gin[d] += buffer_ptr[d]; diff --git a/aten/src/ATen/native/UpSampleBicubic2d.cpp b/aten/src/ATen/native/UpSampleBicubic2d.cpp index d08c8d3a48a67..f5e523c4a9114 100644 --- a/aten/src/ATen/native/UpSampleBicubic2d.cpp +++ b/aten/src/ATen/native/UpSampleBicubic2d.cpp @@ -106,7 +106,7 @@ namespace { template static void upsample_bicubic2d_backward_out_frame( - scalar_t* odata, + const scalar_t* odata, scalar_t* idata, int64_t input_height, int64_t input_width, @@ -136,7 +136,7 @@ static void upsample_bicubic2d_backward_out_frame( } for (const auto i : c10::irange(start, end)) { scalar_t* in = idata + i * input_slice_size; - scalar_t* out = odata + i * output_slice_size; + const scalar_t* out = odata + i * output_slice_size; for (const auto output_y : c10::irange(output_height)) { for (const auto output_x : c10::irange(output_width)) { @@ -205,7 +205,7 @@ static void upsample_bicubic2d_backward_kernel( AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, grad_output.scalar_type(), 
"upsample_bicubic2d_backward", [&] { scalar_t* idata = grad_input.mutable_data_ptr(); - scalar_t* odata = grad_output.data_ptr(); + const scalar_t* odata = grad_output.const_data_ptr(); upsample_bicubic2d_backward_out_frame( odata, diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_deserialize.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_deserialize.cpp index afdece7f0d491..aa2bab7c6b945 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_deserialize.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_deserialize.cpp @@ -11,18 +11,18 @@ namespace ao { namespace sparse { namespace { -const int64_t serialization_version_index = 0; -const int64_t bias_index = 1; -const int64_t out_features_block_size_index = 2; -const int64_t in_features_block_size_index = 3; -const int64_t weight_scales_index = 4; -const int64_t weight_zero_point_index = 5; -const int64_t quantization_scheme_index = 6; -const int64_t row_block_indices_index = 7; -const int64_t col_block_indices_index = 8; -const int64_t weight_values_index = 9; -const int64_t num_output_channels_index = 10; -const int64_t num_input_channels_index = 11; +constexpr int64_t serialization_version_index = 0; +constexpr int64_t bias_index = 1; +constexpr int64_t out_features_block_size_index = 2; +constexpr int64_t in_features_block_size_index = 3; +constexpr int64_t weight_scales_index = 4; +constexpr int64_t weight_zero_point_index = 5; +constexpr int64_t quantization_scheme_index = 6; +constexpr int64_t row_block_indices_index = 7; +constexpr int64_t col_block_indices_index = 8; +constexpr int64_t weight_values_index = 9; +constexpr int64_t num_output_channels_index = 10; +constexpr int64_t num_input_channels_index = 11; template std::vector unwrap_vector(at::Tensor tensor) { diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp index bedf2f4461f3a..8f80d920e3652 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_prepack.cpp @@ -186,7 +186,7 @@ PackedLinearWeightQnnp::PackedLinearWeightQnnp( std::tie(w_zero_points_, w_scales_) = make_zero_points_and_scales_tensor(weight_contig); - const float* weight_scales_data = w_scales_.data_ptr(); + const float* weight_scales_data = w_scales_.const_data_ptr(); at::Tensor qnnp_weight = at::_empty_affine_quantized( weight_contig.sizes(), at::device(c10::kCPU).dtype(c10::kQUInt8), diff --git a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_serialize.cpp b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_serialize.cpp index e557ec3994134..d5790b5bc223e 100644 --- a/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_serialize.cpp +++ b/aten/src/ATen/native/ao_sparse/quantized/cpu/qlinear_serialize.cpp @@ -160,7 +160,7 @@ BCSRSerializationType PackedLinearWeight::serialize() { BCSRSerializationType PackedLinearWeightQnnp::serialize() { at::Tensor w_scales_compact; at::Tensor w_zero_points_compact; - const float* w_scales_data_ptr = w_scales_.data_ptr(); + const float* w_scales_data_ptr = w_scales_.const_data_ptr(); std::function subtract_128 = [](uint8_t v) { return static_cast(static_cast(v) - 128); }; diff --git a/aten/src/ATen/native/batch_norm.h b/aten/src/ATen/native/batch_norm.h index cbddde86ad8ba..eba4b0a963241 100644 --- a/aten/src/ATen/native/batch_norm.h +++ b/aten/src/ATen/native/batch_norm.h @@ -26,8 +26,13 @@ static TensorAccessor conditional_accessor_1d(const 
Tensor& t) { template static scalar_t* conditional_data_ptr(const Tensor& t) { - return t.defined() ? t.contiguous().data_ptr() - : nullptr; + if constexpr (std::is_const_v) { + return t.defined() ? t.contiguous().const_data_ptr() + : nullptr; + } else { + return t.defined() ? t.contiguous().data_ptr() + : nullptr; + } } } // namespace at::native diff --git a/aten/src/ATen/native/cpu/Activation.cpp b/aten/src/ATen/native/cpu/Activation.cpp index a7349d8299ca6..88b43015d9906 100644 --- a/aten/src/ATen/native/cpu/Activation.cpp +++ b/aten/src/ATen/native/cpu/Activation.cpp @@ -30,14 +30,13 @@ static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const using Vec = Vectorized; scalar_t* output_data = output.data_ptr(); scalar_t* buffer_data = buffer.data_ptr(); - scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); parallel_for(0, input.numel(), 1, [&] (int64_t begin, int64_t end) { int64_t size = end - begin; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { Vec data_vec = Vec::loadu(input_data + begin+ d); - Vectorized data_vec0, data_vec1; - std::tie(data_vec0, data_vec1) = convert_to_float(data_vec); + auto [data_vec0, data_vec1] = convert_to_float(data_vec); Vectorized min_vec = minimum(data_vec0, Vectorized(float(0))); Vectorized buffer_vec0 = data_vec0.abs().neg().exp(); Vectorized output_vec0 = min_vec - buffer_vec0.log1p(); @@ -49,8 +48,7 @@ static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const } if (size - d > 0) { Vec data_vec = Vec::loadu(input_data + begin + d, size - d); - Vectorized data_vec0, data_vec1; - std::tie(data_vec0, data_vec1) = convert_to_float(data_vec); + auto [data_vec0, data_vec1] = convert_to_float(data_vec); Vectorized min_vec = minimum(data_vec0, Vectorized(float(0))); Vectorized buffer_vec0 = data_vec0.abs().neg().exp(); Vectorized output_vec0 = min_vec - buffer_vec0.log1p(); @@ -67,7 +65,7 @@ static void log_sigmoid_cpu_kernel(TensorBase &output, TensorBase &buffer, const using Vec = Vectorized; scalar_t* output_data = output.data_ptr(); scalar_t* buffer_data = buffer.data_ptr(); - scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); parallel_for(0, input.numel(), 1, [&] (int64_t begin, int64_t end) { int64_t size = end - begin; int64_t d = 0; @@ -108,10 +106,9 @@ static void log_sigmoid_backward_cpu_kernel(TensorIterator& iter) { return (max_deriv - sign * (float(b) / (float(1) + b))) * float(c); }, [=](Vec a, Vec b, Vec c) -> Vec { - Vectorized a0, a1, b0, b1, c0, c1; - std::tie(a0, a1) = convert_to_float(a); - std::tie(b0, b1) = convert_to_float(b); - std::tie(c0, c1) = convert_to_float(c); + auto [a0, a1] = convert_to_float(a); + auto [b0, b1] = convert_to_float(b); + auto [c0, c1] = convert_to_float(c); auto mask = a0 < zero_vec; auto max_deriv_vec = Vectorized::blendv(zero_vec, one_vec, mask); auto sign_vec = Vectorized::blendv(one_vec.neg(), one_vec, mask); @@ -164,9 +161,8 @@ static void threshold_kernel( return float(x) <= threshold ? 
value : other; }, [&](Vectorized x, Vectorized other) -> Vectorized { - Vec x0, x1, other0, other1; - std::tie(x0, x1) = convert_to_float(x); - std::tie(other0, other1) = convert_to_float(other); + auto [x0, x1] = convert_to_float(x); + auto [other0, other1] = convert_to_float(other); return convert_from_float(Vec::blendv(other0, value_v, x0 <= threshold_v), Vec::blendv(other1, value_v, x1 <= threshold_v)); }); @@ -207,16 +203,15 @@ void elu_kernel(TensorIteratorBase& it, const Scalar& alpha, const Scalar& scale return float(a) <= float(0) ? (std::exp(float(a) * negiptcoef) - float(1)) * negcoef : float(a) * poscoef; }, [&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &one_vec, &zero_vec](Vectorized a) -> Vectorized { - Vectorized a0, a1, res0, res1; - std::tie(a0, a1) = convert_to_float(a); + auto [a0, a1] = convert_to_float(a); auto cmp0 = (a0 > zero_vec); auto cmp1 = (a1 > zero_vec); auto get_res_masked = [&](Vectorized& cmp, Vectorized& a) { return !cmp.zero_mask() ? a * poscoef_vec : Vectorized::blendv(((a * negiptcoef_vec).exp() - one_vec) * negcoef_vec, a * poscoef_vec, cmp); }; - res0 = get_res_masked(cmp0, a0); - res1 = get_res_masked(cmp1, a1); + auto res0 = get_res_masked(cmp0, a0); + auto res1 = get_res_masked(cmp1, a1); return convert_from_float(res0, res1); }); }); @@ -268,10 +263,8 @@ void elu_backward_kernel(TensorIteratorBase& it, const Scalar& alpha, const Scal } }, [&negcoef_vec, &negiptcoef_vec, &poscoef_vec, &zero_vec, is_result](Vectorized a, Vectorized b) -> Vectorized { - Vectorized a0, a1, res0, res1; - std::tie(a0, a1) = convert_to_float(a); - Vectorized b0, b1; - std::tie(b0, b1) = convert_to_float(b); + auto [a0, a1] = convert_to_float(a); + auto [b0, b1] = convert_to_float(b); auto cmp0 = (b0 > zero_vec); auto cmp1 = (b1 > zero_vec); auto get_res_masked = [&](Vectorized& cmp, Vectorized& a, Vectorized& b) { @@ -282,8 +275,8 @@ void elu_backward_kernel(TensorIteratorBase& it, const Scalar& alpha, const Scal return Vectorized::blendv(a * negiptcoef_vec * negcoef_vec * (b * negiptcoef_vec).exp(), a * poscoef_vec, cmp); } }; - res0 = get_res_masked(cmp0, a0, b0); - res1 = get_res_masked(cmp1, a1, b1); + auto res0 = get_res_masked(cmp0, a0, b0); + auto res1 = get_res_masked(cmp1, a1, b1); return convert_from_float(res0, res1); }); }); @@ -364,8 +357,7 @@ void GeluKernelImpl(TensorIteratorBase& it, GeluType approximate) { return float(0.5) * float(x) * (float(1) + std::tanh(inner)); }, [&](Vectorized x) -> Vectorized { - Vectorized x0, x1; - std::tie(x0, x1) = convert_to_float(x); + auto [x0, x1] = convert_to_float(x); auto x0_cube = x0 * x0 * x0; auto x1_cube = x1 * x1 * x1; auto inner_vec0 = kBetaVec * (x0 + kKappaVec * x0_cube); @@ -414,8 +406,7 @@ void GeluKernelImpl(TensorIteratorBase& it, GeluType approximate) { return float(x) * float(0.5) * (float(1) + std::erf(float(x) * kAlpha)); }, [&](Vectorized x) -> Vectorized { - Vectorized x0, x1; - std::tie(x0, x1) = convert_to_float(x); + auto [x0, x1] = convert_to_float(x); auto res0 = x0 * kPointFiveVec * (kOneVec + (x0 * kAlphaVec).erf()); auto res1 = x1 * kPointFiveVec * (kOneVec + (x1 * kAlphaVec).erf()); return convert_from_float(res0, res1); @@ -477,10 +468,8 @@ void GeluBackwardKernelImpl(TensorIteratorBase& it, GeluType approximate) { return float(dy) * (left_derivative + right_derivative); }, [&](Vectorized dy_vec, Vectorized x_vec) -> Vectorized { - Vectorized x0_vec, x1_vec; - std::tie(x0_vec, x1_vec) = convert_to_float(x_vec); - Vectorized dy0_vec, dy1_vec; - std::tie(dy0_vec, dy1_vec) = 
convert_to_float(dy_vec); + auto [x0_vec, x1_vec] = convert_to_float(x_vec); + auto [dy0_vec, dy1_vec] = convert_to_float(dy_vec); auto x0_sq = x0_vec * x0_vec; auto x1_sq = x1_vec * x1_vec; auto x0_cube = x0_vec * x0_vec * x0_vec; @@ -583,10 +572,8 @@ void GeluBackwardKernelImpl(TensorIteratorBase& it, GeluType approximate) { return float(dy) * (cdf + float(x) * pdf); }, [&](Vectorized dy, Vectorized x) -> Vectorized { - Vectorized x0, x1; - std::tie(x0, x1) = convert_to_float(x); - Vectorized dy0, dy1; - std::tie(dy0, dy1) = convert_to_float(dy); + auto [x0, x1] = convert_to_float(x); + auto [dy0, dy1] = convert_to_float(dy); auto cdf_vec0 = kPointFiveVec * (kOneVec + (x0 * kAlphaVec).erf()); auto cdf_vec1 = kPointFiveVec * (kOneVec + (x1 * kAlphaVec).erf()); auto pdf_vec0 = kBetaVec * (x0 * x0 * kMinusPointFiveVec).exp(); @@ -643,8 +630,7 @@ void hardsigmoid_kernel(TensorIteratorBase& iter) { return std::min(std::max(float(self_val) + three, zero), six) / six; }, [&](vec::Vectorized self_val) -> vec::Vectorized { - Vectorized self_val0, self_val1; - std::tie(self_val0, self_val1) = convert_to_float(self_val); + auto [self_val0, self_val1] = convert_to_float(self_val); self_val0 = minimum( maximum(self_val0 + kThreeVec, kZeroVec), kSixVec @@ -698,9 +684,8 @@ void hardsigmoid_backward_kernel(TensorIteratorBase& iter) { : zero; }, [=](Vectorized grad_val, Vectorized self_val) -> Vectorized { - Vec self_val0, self_val1, grad_val0, grad_val1; - std::tie(self_val0, self_val1) = convert_to_float(self_val); - std::tie(grad_val0, grad_val1) = convert_to_float(grad_val); + auto [self_val0, self_val1] = convert_to_float(self_val); + auto [grad_val0, grad_val1] = convert_to_float(grad_val); Vec gradNonZeroMask = (self_val0 > neg_three) & (self_val0 < three); self_val0 = Vec::blendv(kZeroVec, grad_val0 * kOneSixthVec, gradNonZeroMask); gradNonZeroMask = (self_val1 > neg_three) & (self_val1 < three); @@ -759,11 +744,9 @@ void softshrink_kernel(TensorIteratorBase& iter, const Scalar& lambd) { return float(a) > lambd_val ? a - lambd_val : (float(a) < -lambd_val ? a + lambd_val : float(0)); }, [=](Vectorized self_val) -> Vectorized { - Vectorized self_val0, self_val1; - Vectorized self_val_t0, self_val_t1; - std::tie(self_val0, self_val1) = convert_to_float(self_val); - self_val_t0 = convert_from_float((self_val0 > lambdVec) & (self_val0 - lambdVec), (self_val1 > lambdVec) & (self_val1 - lambdVec)); - self_val_t1 = convert_from_float((self_val0 < -lambd_val) & (self_val0 + lambdVec), (self_val1 < -lambd_val) & (self_val1 + lambdVec)); + auto [self_val0, self_val1] = convert_to_float(self_val); + auto self_val_t0 = convert_from_float((self_val0 > lambdVec) & (self_val0 - lambdVec), (self_val1 > lambdVec) & (self_val1 - lambdVec)); + auto self_val_t1 = convert_from_float((self_val0 < -lambd_val) & (self_val0 + lambdVec), (self_val1 < -lambd_val) & (self_val1 + lambdVec)); return (self_val_t0 | self_val_t1); }); }); @@ -812,9 +795,8 @@ void hardtanh_backward_kernel(TensorIterator& iter, const Scalar& min, const Sca return (float(self_val) <= min_val || float(self_val) >= max_val) ? 
scalar_t(0) : grad_val; }, [=](Vectorized grad_val, Vectorized self_val) -> Vectorized { - Vectorized grad_val0, grad_val1, self_val0, self_val1; - std::tie(grad_val0, grad_val1) = convert_to_float(grad_val); - std::tie(self_val0, self_val1) = convert_to_float(self_val); + auto [grad_val0, grad_val1] = convert_to_float(grad_val); + auto [self_val0, self_val1] = convert_to_float(self_val); return convert_from_float( ((self_val0 > min_val) & (self_val0 < max_val)) & grad_val0, ((self_val1 > min_val) & (self_val1 < max_val)) & grad_val1 @@ -853,8 +835,7 @@ void hardswish_kernel(TensorIterator& iter) { return float(x) * std::min(std::max(float(x) + three, zero), six) / six; }, [&](vec::Vectorized x_vec) { - Vectorized x_vec0, x_vec1; - std::tie(x_vec0, x_vec1) = convert_to_float(x_vec); + auto [x_vec0, x_vec1] = convert_to_float(x_vec); x_vec0 = x_vec0 * minimum( maximum(x_vec0 + kThreeVec, kZeroVec), kSixVec @@ -915,9 +896,8 @@ void hardswish_backward_kernel(TensorIterator& iter) { } }, [&](vec::Vectorized grad_val, vec::Vectorized self_val) { - Vectorized self_val0, self_val1, grad_val0, grad_val1; - std::tie(self_val0, self_val1) = convert_to_float(self_val); - std::tie(grad_val0, grad_val1) = convert_to_float(grad_val); + auto [self_val0, self_val1] = convert_to_float(self_val); + auto [grad_val0, grad_val1] = convert_to_float(grad_val); self_val0 = Vec::blendv( Vec::blendv( grad_val0 * ((self_val0 / kThreeVec) + kOneHalfVec), @@ -990,8 +970,7 @@ static void leaky_relu_kernel(TensorIteratorBase& iter, const Scalar& negval_) { return float(a) > float(0) ? float(a) : float(a) * negval; }, [&](Vectorized a) -> Vectorized { - Vectorized a0, a1; - std::tie(a0, a1) = convert_to_float(a); + auto [a0, a1] = convert_to_float(a); auto res0 = a0 * (Vectorized::blendv(negval_v, one_vec, a0 > zero_vec)); auto res1 = a1 * (Vectorized::blendv(negval_v, one_vec, a1 > zero_vec)); return convert_from_float(res0, res1); @@ -1030,9 +1009,8 @@ static void leaky_relu_backward_kernel(TensorIteratorBase& iter, const Scalar& n return float(a) > float(0) ? float(b) : float(b) * negval; }, [&](Vectorized a, Vectorized b) -> Vectorized { - Vectorized a0, a1, b0, b1; - std::tie(a0, a1) = convert_to_float(a); - std::tie(b0, b1) = convert_to_float(b); + auto [a0, a1] = convert_to_float(a); + auto [b0, b1] = convert_to_float(b); auto res0 = b0 * (Vectorized::blendv(negval_v, one_vec, a0 > zero_vec)); auto res1 = b1 * (Vectorized::blendv(negval_v, one_vec, a1 > zero_vec)); return convert_from_float(res0, res1); @@ -1073,8 +1051,7 @@ void softplus_kernel(TensorIteratorBase& iter, const Scalar& beta_, const Scalar : static_cast((std::log1p(std::exp(float(a) * beta))) / beta); }, [beta_vec, threshold_vec](Vectorized a) -> Vectorized { - Vectorized a0, a1; - std::tie(a0, a1) = convert_to_float(a); + auto [a0, a1] = convert_to_float(a); a0 = Vec::blendv((a0 * beta_vec).exp().log1p() / beta_vec, a0, (a0 * beta_vec) > threshold_vec); a1 = Vec::blendv((a1 * beta_vec).exp().log1p() / beta_vec, a1, (a1 * beta_vec) > threshold_vec); return convert_from_float(a0, a1); @@ -1118,9 +1095,8 @@ void softplus_backward_kernel(TensorIteratorBase& iter, const Scalar& beta_, con return (float(b) * beta) > threshold ? 
a : static_cast(float(a) * z / (z + float(1.))); }, [beta_vec, one_vec, threshold_vec](Vectorized a, Vectorized b) -> Vectorized { - Vectorized a0, a1, b0, b1; - std::tie(a0, a1) = convert_to_float(a); - std::tie(b0, b1) = convert_to_float(b); + auto [a0, a1] = convert_to_float(a); + auto [b0, b1] = convert_to_float(b); Vec z = (b0 * beta_vec).exp(); a0 = Vec::blendv(a0 * z / (z + one_vec), a0, (b0 * beta_vec) > threshold_vec); z = (b1 * beta_vec).exp(); @@ -1162,9 +1138,8 @@ void glu_kernel(TensorIteratorBase& iter) { return float(a) * (float_one_val / (float_one_val + std::exp(- float(b)))); }, [float_one_vec](Vectorized a, Vectorized b) -> Vectorized { - Vectorized a0, a1, b0, b1; - std::tie(a0, a1) = convert_to_float(a); - std::tie(b0, b1) = convert_to_float(b); + auto [a0, a1] = convert_to_float(a); + auto [b0, b1] = convert_to_float(b); return convert_from_float(a0 * (float_one_vec / (float_one_vec + b0.neg().exp())), a1 * (float_one_vec / (float_one_vec + b1.neg().exp()))); }); @@ -1217,10 +1192,9 @@ void glu_backward_kernel(TensorIterator& iter) { return (float_one_val - float(a)) * float(a) * float(b) * float(c); }, [float_one_vec](Vectorized a, Vectorized b, Vectorized c) -> Vectorized { - Vectorized a0, a1, b0, b1, c0, c1; - std::tie(a0, a1) = convert_to_float(a); - std::tie(b0, b1) = convert_to_float(b); - std::tie(c0, c1) = convert_to_float(c); + auto [a0, a1] = convert_to_float(a); + auto [b0, b1] = convert_to_float(b); + auto [c0, c1] = convert_to_float(c); a0 = (float_one_vec - a0) * a0 * b0 * c0; a1 = (float_one_vec - a1) * a1 * b1 * c1; return convert_from_float(a0, a1); @@ -1254,8 +1228,7 @@ void silu_kernel(TensorIteratorBase& iter) { return float(x) / (1.0f + std::exp(-float(x))); }, [kOneVec](Vectorized x_vec) -> Vectorized { - Vectorized x_vec0, x_vec1; - std::tie(x_vec0, x_vec1) = convert_to_float(x_vec); + auto [x_vec0, x_vec1] = convert_to_float(x_vec); return convert_from_float( x_vec0 / (kOneVec + x_vec0.neg().exp()), x_vec1 / (kOneVec + x_vec1.neg().exp())); @@ -1289,9 +1262,8 @@ void silu_backward_kernel(TensorIteratorBase& iter) { return dy * sigmoid * (1.0f + x * (1.0f - sigmoid)); }, [kOneVec](Vectorized dy_vec, Vectorized x_vec) -> Vectorized { - Vectorized x_vec0, x_vec1, dy_vec0, dy_vec1; - std::tie(x_vec0, x_vec1) = convert_to_float(x_vec); - std::tie(dy_vec0, dy_vec1) = convert_to_float(dy_vec); + auto [x_vec0, x_vec1] = convert_to_float(x_vec); + auto [dy_vec0, dy_vec1] = convert_to_float(dy_vec); const Vectorized sigmoid0 = kOneVec / (kOneVec + x_vec0.neg().exp()); const Vectorized sigmoid1 = @@ -1330,8 +1302,7 @@ void mish_kernel(TensorIteratorBase& iter) { return static_cast(float(x) * std::tanh(std::log1p(std::exp(float(x))))); }, [](Vectorized x_vec) -> Vectorized { - Vectorized x_vec0, x_vec1; - std::tie(x_vec0, x_vec1) = convert_to_float(x_vec); + auto [x_vec0, x_vec1] = convert_to_float(x_vec); return convert_from_float( x_vec0 * x_vec0.exp().log1p().tanh(), x_vec1 * x_vec1.exp().log1p().tanh() @@ -1367,9 +1338,8 @@ void mish_backward_kernel(TensorIterator& iter) { return dy * (tanh_softplus + x * sigmoid * (1.0f - tanh_softplus * tanh_softplus)); }, [kOneVec](Vectorized dy_vec, Vectorized x_vec) -> Vectorized { - Vectorized x_vec0, x_vec1, dy_vec0, dy_vec1; - std::tie(x_vec0, x_vec1) = convert_to_float(x_vec); - std::tie(dy_vec0, dy_vec1) = convert_to_float(dy_vec); + auto [x_vec0, x_vec1] = convert_to_float(x_vec); + auto [dy_vec0, dy_vec1] = convert_to_float(dy_vec); const Vec sigmoid0 = kOneVec / (kOneVec + x_vec0.neg().exp()); const Vec 
sigmoid1 = kOneVec / (kOneVec + x_vec1.neg().exp()); const Vec tanh_softplus0 = x_vec0.exp().log1p().tanh(); diff --git a/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp b/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp index b6ba000954d50..6f96d495f85c4 100644 --- a/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AdaptiveAvgPoolKernel.cpp @@ -15,14 +15,14 @@ namespace at::native { namespace { template -void cpu_adaptive_avg_pool( +void cpu_adaptive_avg_pool2d( Tensor& output_, const Tensor& input_, IntArrayRef output_size) { auto input = input_.contiguous(); auto output = output_.contiguous(); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t ndim = input.ndimension(); @@ -36,7 +36,7 @@ void cpu_adaptive_avg_pool( // parallel on dim of N, C at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { for (const auto c : c10::irange(begin, end)) { - scalar_t* input_ptr = input_data + c * input_height * input_width; + const scalar_t* input_ptr = input_data + c * input_height * input_width; scalar_t* output_ptr = output_data + c * output_height * output_width; for (const auto oh : c10::irange(output_height)) { @@ -69,7 +69,7 @@ void cpu_adaptive_avg_pool( template typename std::enable_if_t>, void> -cpu_adaptive_avg_pool_channels_last( +cpu_adaptive_avg_pool2d_channels_last( Tensor& output_, const Tensor& input_, IntArrayRef output_size) { @@ -77,7 +77,7 @@ cpu_adaptive_avg_pool_channels_last( auto input = input_.contiguous(memory_format); auto output = output_.contiguous(memory_format); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t nbatch = input.size(0); @@ -107,7 +107,7 @@ cpu_adaptive_avg_pool_channels_last( scalar_t* out = output_data + i * channels; int64_t size = channels; - // Note: For oridinary usage scenario, each out lane should + // Note: For ordinary usage scenario, each out lane should // fit in L1 cache; otherwise consider block dim C. 
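// (In the ChannelsLast/NHWC layout used here, element (n, c, ih, iw) sits at
// offset ((n * input_height + ih) * input_width + iw) * channels + c, so for a
// fixed spatial position the channel values are contiguous in memory and the
// three passes below can stream over them with Vec::loadu/Vec::store.)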
// Pass I: zero the out lane int64_t d1 = 0; @@ -121,7 +121,7 @@ cpu_adaptive_avg_pool_channels_last( // Pass II: compute local sum for (const auto ih : c10::irange(ih0, ih1)) { for (const auto iw : c10::irange(iw0, iw1)) { - scalar_t* in = input_data + n * input_height * input_width * channels + + const scalar_t* in = input_data + n * input_height * input_width * channels + ih * input_width * channels + iw * channels; int64_t d2 = 0; @@ -156,7 +156,7 @@ cpu_adaptive_avg_pool_channels_last( template typename std::enable_if_t>, void> -cpu_adaptive_avg_pool_channels_last( +cpu_adaptive_avg_pool2d_channels_last( Tensor& output_, const Tensor& input_, IntArrayRef output_size) { @@ -164,7 +164,7 @@ cpu_adaptive_avg_pool_channels_last( auto input = input_.contiguous(memory_format); auto output = output_.contiguous(memory_format); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t nbatch = input.size(0); @@ -212,7 +212,7 @@ cpu_adaptive_avg_pool_channels_last( // Pass II: compute local sum for (const auto ih : c10::irange(ih0, ih1)) { for (const auto iw : c10::irange(iw0, iw1)) { - scalar_t* in = input_data + n * input_height * input_width * channels + + const scalar_t* in = input_data + n * input_height * input_width * channels + ih * input_width * channels + iw * channels; int64_t d2 = 0; @@ -255,13 +255,13 @@ cpu_adaptive_avg_pool_channels_last( } template -void cpu_adaptive_avg_pool_backward( +void cpu_adaptive_avg_pool2d_backward( Tensor& grad_input_, const Tensor& grad_output_) { auto grad_output = grad_output_.contiguous(); auto grad_input = grad_input_.contiguous(); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); auto grad_input_data = grad_input.mutable_data_ptr(); int64_t ndim = grad_output.ndimension(); @@ -276,7 +276,7 @@ void cpu_adaptive_avg_pool_backward( at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { for (const auto c : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + c * input_height * input_width; - scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; + const scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; for (const auto oh : c10::irange(output_height)) { int64_t ih0 = start_index(oh, output_height, input_height); @@ -305,7 +305,7 @@ void cpu_adaptive_avg_pool_backward( } template -void cpu_adaptive_avg_pool_backward_channels_last( +void cpu_adaptive_avg_pool2d_backward_channels_last( Tensor& grad_input_, const Tensor& grad_output_) { auto memory_format = at::MemoryFormat::ChannelsLast; @@ -313,7 +313,7 @@ void cpu_adaptive_avg_pool_backward_channels_last( auto grad_output = grad_output_.contiguous(memory_format); auto grad_input_data = grad_input.mutable_data_ptr(); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); int64_t nbatch = grad_input.size(0); int64_t channels = grad_input.size(1); @@ -327,7 +327,7 @@ void cpu_adaptive_avg_pool_backward_channels_last( at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) { for (const auto n : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + n * input_height * input_width * channels; - scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels; + const scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels; for (const auto oh : 
c10::irange(output_height)) { int64_t ih0 = start_index(oh, output_height, input_height); @@ -339,7 +339,7 @@ void cpu_adaptive_avg_pool_backward_channels_last( int64_t iw1 = end_index(ow, output_width, input_width); int64_t kw = iw1 - iw0; - scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels; + const scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels; int64_t size = channels; for (const auto ih : c10::irange(ih0, ih1)) { for (const auto iw : c10::irange(iw0, iw1)) { @@ -373,13 +373,13 @@ void adaptive_avg_pool2d_kernel_impl( case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, input.scalar_type(), "adaptive_avg_pool2d", [&] { using param_t = at::opmath_type; - cpu_adaptive_avg_pool(output, input, output_size); + cpu_adaptive_avg_pool2d(output, input, output_size); }); break; } case at::MemoryFormat::ChannelsLast: { AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, input.scalar_type(), "adaptive_avg_pool2d_channels_last", [&]{ - cpu_adaptive_avg_pool_channels_last(output, input, output_size); + cpu_adaptive_avg_pool2d_channels_last(output, input, output_size); }); break; } @@ -394,13 +394,458 @@ void adapative_avg_pool2d_backward_kernel_impl( switch (grad_output.suggest_memory_format()) { case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, grad_output.scalar_type(), "adaptive_avg_pool2d_backward", [&] { - cpu_adaptive_avg_pool_backward(grad_input, grad_output); + cpu_adaptive_avg_pool2d_backward(grad_input, grad_output); }); break; } case at::MemoryFormat::ChannelsLast: { AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, grad_output.scalar_type(), "adaptive_avg_pool2d_backward_channels_last", [&]{ - cpu_adaptive_avg_pool_backward_channels_last(grad_input, grad_output); + cpu_adaptive_avg_pool2d_backward_channels_last(grad_input, grad_output); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } +} + + +template +void cpu_adaptive_avg_pool3d( + Tensor& output_, + const Tensor& input_, + IntArrayRef output_size) { + auto input = input_.contiguous(); + auto output = output_.contiguous(); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + int64_t ndim = input.ndimension(); + // treat batch size and channels as one dimension + int64_t channels = ndim == 4 ? 
input.size(0) : input.size(0) * input.size(1); + int64_t input_depth = input.size(-3); + int64_t input_height = input.size(-2); + int64_t input_width = input.size(-1); + int64_t output_depth = output_size[0]; + int64_t output_height = output_size[1]; + int64_t output_width = output_size[2]; + + // parallel on dim of N, C + at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + scalar_t* input_ptr = input_data + c * input_depth * input_height * input_width; + scalar_t* output_ptr = output_data + c * output_depth * output_height * output_width; + + for (const auto od : c10::irange(output_depth)) { + int64_t id0 = start_index(od, output_depth, input_depth); + int64_t id1 = end_index(od, output_depth, input_depth); + int64_t kd = id1 - id0; + + for (const auto oh : c10::irange(output_height)) { + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + int64_t kh = ih1 - ih0; + + for (const auto ow : c10::irange(output_width)) { + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + int64_t kw = iw1 - iw0; + + // compute local average + accscalar_t sum = 0; + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { + sum += accscalar_t(input_ptr[id * input_height * input_width + ih * input_width + iw]); + } + } + } + output_ptr[od * output_height * output_width + oh * output_width + ow] = scalar_t(sum / kd / kh / kw); + } + } + } + } + }); + + if (!output_.is_contiguous()) { + output_.copy_(output); + } +} + + +template +typename std::enable_if_t>, void> +cpu_adaptive_avg_pool3d_channels_last( + Tensor& output_, + const Tensor& input_, + IntArrayRef output_size) { + auto memory_format = at::MemoryFormat::ChannelsLast3d; + auto input = input_.contiguous(memory_format); + auto output = output_.contiguous(memory_format); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + int64_t nbatch = input.size(0); + int64_t channels = input.size(1); + int64_t input_depth = input.size(2); + int64_t input_height = input.size(3); + int64_t input_width = input.size(4); + int64_t output_depth = output_size[0]; + int64_t output_height = output_size[1]; + int64_t output_width = output_size[2]; + + using Vec = vec::Vectorized; + // parallel on dim N, H, W + at::parallel_for(0, nbatch * output_depth * output_height * output_width, 0, [&](int64_t begin, int64_t end) { + int64_t n = 0; + int64_t od = 0; + int64_t oh = 0; + int64_t ow = 0; + data_index_init(begin, n, nbatch, od, output_depth, oh, output_height, ow, output_width); + + for (const auto i : c10::irange(begin, end)) { + int64_t id0 = start_index(od, output_depth, input_depth); + int64_t id1 = end_index(od, output_depth, input_depth); + int64_t kd = id1 - id0; + + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + int64_t kh = ih1 - ih0; + + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + int64_t kw = iw1 - iw0; + + scalar_t* out = output_data + i * channels; + int64_t size = channels; + + // Note: For oridinary usage scenario, each out lane should + // fit in L1 cache; otherwise consider block dim C. 
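// (The [id0, id1), [ih0, ih1) and [iw0, iw1) windows come from the existing
// start_index/end_index helpers which, assuming they keep their usual
// definitions, evaluate to floor(o * isize / osize) and
// ceil((o + 1) * isize / osize). For example, input_depth = 5 and
// output_depth = 3 give depth windows [0, 2), [1, 4) and [3, 5), so kd is
// 2, 3 and 2 and each output element averages kd * kh * kw inputs.)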
+ // Pass I: zero the out lane + int64_t d1 = 0; + for (; d1 < size - (size % Vec::size()); d1 += Vec::size()) { + Vec out_vec = Vec(scalar_t(0)); + out_vec.store(out + d1); + } + for (; d1 < size; d1++) { + out[d1] = scalar_t(0); + } + // Pass II: compute local sum + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { + scalar_t* in = input_data + n * input_depth * input_height * input_width * channels + + id * input_height * input_width * channels + ih * input_width * channels + iw * channels; + + int64_t d2 = 0; + for (; d2 < size - (size % Vec::size()); d2 += Vec::size()) { + Vec out_vec = Vec::loadu(out + d2) + Vec::loadu(in + d2); + out_vec.store(out + d2); + } + for (; d2 < size; d2++) { + out[d2] += in[d2]; + } + } + } + } + // Pass III: compute local average + int64_t d3 = 0; + for (; d3 < size - (size % Vec::size()); d3 += Vec::size()) { + Vec out_vec = Vec::loadu(out + d3) / Vec(scalar_t(kd * kh * kw)); + out_vec.store(out + d3); + } + for (; d3 < size; d3++) { + out[d3] = out[d3] / kd / kh / kw; + } + + // move on to next output index + data_index_step(n, nbatch, od, output_depth, oh, output_height, ow, output_width); + } + }); + + if (!output_.is_contiguous(memory_format)) { + output_.copy_(output); + } +} + +template +typename std::enable_if_t>, void> +cpu_adaptive_avg_pool3d_channels_last( + Tensor& output_, + const Tensor& input_, + IntArrayRef output_size) { + auto memory_format = at::MemoryFormat::ChannelsLast3d; + auto input = input_.contiguous(memory_format); + auto output = output_.contiguous(memory_format); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + int64_t nbatch = input.size(0); + int64_t channels = input.size(1); + int64_t input_depth = input.size(2); + int64_t input_height = input.size(3); + int64_t input_width = input.size(4); + int64_t output_depth = output_size[0]; + int64_t output_height = output_size[1]; + int64_t output_width = output_size[2]; + + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + // parallel on dim N,D, H, W + at::parallel_for(0, nbatch * output_depth * output_height * output_width, 0, [&](int64_t begin, int64_t end) { + int64_t n = 0; + int64_t oh = 0; + int64_t ow = 0; + int64_t od = 0; + data_index_init(begin, n, nbatch, od, output_depth, oh, output_height, ow, output_width); + + // temp buffer for sum, use float as accumulation type + // can't reuse output buffer to store sum since it is BFloat16/Half + auto sum_arr = std::make_unique(channels); + float* sum = sum_arr.get(); + + for (const auto i : c10::irange(begin, end)) { + int64_t id0 = start_index(od, output_depth, input_depth); + int64_t id1 = end_index(od, output_depth, input_depth); + int64_t kd = id1 - id0; + + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + int64_t kh = ih1 - ih0; + + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + int64_t kw = iw1 - iw0; + + scalar_t* out = output_data + i * channels; + int64_t size = channels; + + // Pass I: zero the out lane + int64_t d1 = 0; + for (; d1 < size - (size % fVec::size()); d1 += fVec::size()) { + fVec sum_fvec = fVec(float(0)); + sum_fvec.store(sum + d1); + } + for (; d1 < size; d1++) { + sum[d1] = float(0); + } + // Pass II: compute local sum + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for 
(const auto iw : c10::irange(iw0, iw1)) { + scalar_t* in = input_data + n * input_depth * input_height * input_width * channels + + id * input_height * input_width * channels + + ih * input_width * channels + iw * channels; + + int64_t d2 = 0; + for (; d2 < size - (size % bVec::size()); d2 += bVec::size()) { + bVec data_bvec = bVec::loadu(in + d2); + fVec data_fvec0, data_fvec1; + std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + + fVec sum_fvec0 = fVec::loadu(sum + d2) + data_fvec0; + fVec sum_fvec1 = fVec::loadu(sum + d2 + fVec::size()) + data_fvec1; + sum_fvec0.store(sum + d2); + sum_fvec1.store(sum + d2 + fVec::size()); + } + for (; d2 < size; d2++) { + sum[d2] += float(in[d2]); + } + } + } + } + // Pass III: compute local average + int64_t d3 = 0; + for (; d3 < size - (size % bVec::size()); d3 += bVec::size()) { + fVec out_fvec0 = fVec::loadu(sum + d3) / fVec(float(kd * kh * kw)); + fVec out_fvec1 = fVec::loadu(sum + d3 + fVec::size()) / fVec(float(kd * kh * kw)); + + bVec out_bvec = convert_from_float(out_fvec0, out_fvec1); + out_bvec.store(out + d3); + } + for (; d3 < size; d3++) { + out[d3] = scalar_t(sum[d3] / kd / kh / kw); + } + + // move on to next output index + data_index_step(n, nbatch, od, output_depth, oh, output_height, ow, output_width); + } + }); + + if (!output_.is_contiguous(memory_format)) { + output_.copy_(output); + } +} + +template +void cpu_adaptive_avg_pool3d_backward( + Tensor& grad_input_, + const Tensor& grad_output_) { + auto grad_output = grad_output_.contiguous(); + auto grad_input = grad_input_.contiguous(); + + auto grad_output_data = grad_output.data_ptr(); + auto grad_input_data = grad_input.mutable_data_ptr(); + + int64_t ndim = grad_output.ndimension(); + // treat batch size and channels as one dimension + int64_t channels = ndim == 4 ? 
grad_output.size(0) : grad_output.size(0) * grad_output.size(1); + int64_t input_depth = grad_input.size(-3); + int64_t input_height = grad_input.size(-2); + int64_t input_width = grad_input.size(-1); + int64_t output_depth = grad_output.size(-3); + int64_t output_height = grad_output.size(-2); + int64_t output_width = grad_output.size(-1); + + // parallel on dim of N, C + at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + scalar_t* grad_input_ptr = grad_input_data + c * input_depth * input_height * input_width; + scalar_t* grad_output_ptr = grad_output_data + c * output_depth * output_height * output_width; + + for (const auto od : c10::irange(output_depth)) { + int64_t id0 = start_index(od, output_depth, input_depth); + int64_t id1 = end_index(od, output_depth, input_depth); + int64_t kd = id1 - id0; + for (const auto oh : c10::irange(output_height)) { + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + int64_t kh = ih1 - ih0; + + for (const auto ow : c10::irange(output_width)) { + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + int64_t kw = iw1 - iw0; + + scalar_t grad_delta = grad_output_ptr[od * output_width * output_height + oh * output_width + ow] / kd / kh / kw; + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { + grad_input_ptr[id * input_height * input_width + ih * input_width + iw] += grad_delta; + } + } + } + } + } + } + } + }); + + if (!grad_input_.is_contiguous()) { + grad_input_.copy_(grad_input); + } +} + +template +void cpu_adaptive_avg_pool3d_backward_channels_last( + Tensor& grad_input_, + const Tensor& grad_output_) { + auto memory_format = at::MemoryFormat::ChannelsLast3d; + auto grad_input = grad_input_.contiguous(memory_format); + auto grad_output = grad_output_.contiguous(memory_format); + + auto grad_input_data = grad_input.mutable_data_ptr(); + auto grad_output_data = grad_output.data_ptr(); + + int64_t nbatch = grad_input.size(0); + int64_t channels = grad_input.size(1); + int64_t input_depth = grad_input.size(2); + int64_t input_height = grad_input.size(3); + int64_t input_width = grad_input.size(4); + int64_t output_depth = grad_output.size(2); + int64_t output_height = grad_output.size(3); + int64_t output_width = grad_output.size(4); + + using Vec = vec::Vectorized; + // parallel on dim N + at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) { + for (const auto n : c10::irange(begin, end)) { + scalar_t* grad_input_ptr = grad_input_data + n * input_depth * input_height * input_width * channels; + scalar_t* grad_output_ptr = grad_output_data + n * output_depth * output_height * output_width * channels; + + for (const auto od : c10::irange(output_depth)) { + int64_t id0 = start_index(od, output_depth, input_depth); + int64_t id1 = end_index(od, output_depth, input_depth); + int64_t kd = id1 - id0; + for (const auto oh : c10::irange(output_height)) { + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + int64_t kh = ih1 - ih0; + + for (const auto ow : c10::irange(output_width)) { + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + int64_t kw = iw1 - iw0; + + scalar_t* gout = grad_output_ptr + od * output_height * output_width * channels
+ oh * output_width * channels + ow * channels; + int64_t size = channels; + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { + scalar_t* gin = grad_input_ptr + id * input_width * input_height * channels + ih * input_width * channels + iw * channels; + + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec gin_vec = Vec::loadu(gin + d) + Vec::loadu(gout + d) / Vec(scalar_t(kd * kh * kw)); + gin_vec.store(gin + d); + } + for (; d < size; d++) { + gin[d] += gout[d] / kd / kh / kw; + } + } + } + } + } + } + } + } + }); + + if (!grad_input_.is_contiguous(memory_format)) { + grad_input_.copy_(grad_input); + } +} + + +void adaptive_avg_pool3d_kernel_impl( + Tensor& output, + const Tensor& input, + IntArrayRef output_size) { + switch (input.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, input.scalar_type(), "adaptive_avg_pool3d", [&] { + using param_t = at::opmath_type; + cpu_adaptive_avg_pool3d(output, input, output_size); + }); + break; + } + case at::MemoryFormat::ChannelsLast3d: { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, input.scalar_type(), "adaptive_avg_pool3d_channels_last", [&]{ + cpu_adaptive_avg_pool3d_channels_last(output, input, output_size); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } +} + +void adapative_avg_pool3d_backward_kernel_impl( + Tensor& grad_input, + const Tensor& grad_output) { + switch (grad_output.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, grad_output.scalar_type(), "adaptive_avg_pool3d_backward", [&] { + cpu_adaptive_avg_pool3d_backward(grad_input, grad_output); + }); + break; + } + case at::MemoryFormat::ChannelsLast3d: { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, grad_output.scalar_type(), "adaptive_avg_pool3d_backward_channels_last", [&]{ + cpu_adaptive_avg_pool3d_backward_channels_last(grad_input, grad_output); }); break; } @@ -413,5 +858,7 @@ void adapative_avg_pool2d_backward_kernel_impl( REGISTER_DISPATCH(adaptive_avg_pool2d_kernel, &adaptive_avg_pool2d_kernel_impl); REGISTER_DISPATCH(adaptive_avg_pool2d_backward_kernel, &adapative_avg_pool2d_backward_kernel_impl); +REGISTER_DISPATCH(adaptive_avg_pool3d_kernel, &adaptive_avg_pool3d_kernel_impl); +REGISTER_DISPATCH(adaptive_avg_pool3d_backward_kernel, &adapative_avg_pool3d_backward_kernel_impl); } // at::native diff --git a/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp b/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp index 923f0a7034b85..2306fd05d132a 100644 --- a/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AdaptiveMaxPoolKernel.cpp @@ -15,7 +15,7 @@ namespace at::native { namespace { template -void cpu_adaptive_max_pool( +void cpu_adaptive_max_pool2d( const Tensor& output_, const Tensor& indices_, const Tensor& input_, @@ -24,7 +24,7 @@ void cpu_adaptive_max_pool( auto output = output_.contiguous(); auto indices = indices_.contiguous(); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); auto indices_data = indices.data_ptr(); @@ -39,7 +39,7 @@ void cpu_adaptive_max_pool( // parallel on dim of N, C at::parallel_for(0, channels, 0, [&](int64_t begin, 
int64_t end) { for (const auto c : c10::irange(begin, end)) { - scalar_t* input_ptr = input_data + c * input_height * input_width; + const scalar_t* input_ptr = input_data + c * input_height * input_width; scalar_t* output_ptr = output_data + c * output_height * output_width; int64_t* indices_ptr = indices_data + c * output_height * output_width; @@ -83,19 +83,19 @@ void cpu_adaptive_max_pool( template typename std::enable_if_t>, void> -cpu_adaptive_max_pool_channels_last( +cpu_adaptive_max_pool2d_channels_last( const Tensor& output_, const Tensor& indices_, const Tensor& input_, IntArrayRef output_size) { TORCH_CHECK(input_.ndimension() == 4, - "adaptive max pooling with channels last format supports tensors with 4 dims"); + "2d adaptive max pooling with channels last format supports tensors with 4 dims"); auto memory_format = at::MemoryFormat::ChannelsLast; auto input = input_.contiguous(memory_format); auto output = output_.contiguous(memory_format); auto indices = indices_.contiguous(memory_format); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); auto indices_data = indices.data_ptr(); @@ -109,7 +109,7 @@ cpu_adaptive_max_pool_channels_last( using Vec = vec::Vectorized; using integer_t = vec::int_same_size_t; using iVec = vec::Vectorized; - // for the convience of vectorization, use integer of the same size of scalar_t, + // for the convenience of vectorization, use integer of the same size of scalar_t, // e.g. int32_t for float, int64_t for double // need to make sure doesn't overflow TORCH_CHECK(input_height * input_width <= std::numeric_limits::max()); @@ -151,7 +151,7 @@ cpu_adaptive_max_pool_channels_last( // Pass II: compute local max for (int64_t ih = ih0; ih < ih1; ih ++) { for (int64_t iw = iw0; iw < iw1; iw ++) { - scalar_t* in = input_data + n * input_height * input_width * channels + + const scalar_t* in = input_data + n * input_height * input_width * channels + ih * input_width * channels + iw * channels; int64_t d2 = 0; @@ -200,19 +200,19 @@ cpu_adaptive_max_pool_channels_last( template typename std::enable_if_t>, void> -cpu_adaptive_max_pool_channels_last( +cpu_adaptive_max_pool2d_channels_last( const Tensor& output_, const Tensor& indices_, const Tensor& input_, IntArrayRef output_size) { TORCH_CHECK(input_.ndimension() == 4, - "adaptive max pooling with channels last format supports tensors with 4 dims"); + "2d adaptive max pooling with channels last format supports tensors with 4 dims"); auto memory_format = at::MemoryFormat::ChannelsLast; auto input = input_.contiguous(memory_format); auto output = output_.contiguous(memory_format); auto indices = indices_.contiguous(memory_format); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); auto indices_data = indices.data_ptr(); @@ -269,7 +269,7 @@ cpu_adaptive_max_pool_channels_last( // Pass II: compute local max for (int64_t ih = ih0; ih < ih1; ih ++) { for (int64_t iw = iw0; iw < iw1; iw ++) { - scalar_t* in = input_data + n * input_height * input_width * channels + + const scalar_t* in = input_data + n * input_height * input_width * channels + ih * input_width * channels + iw * channels; int64_t d2 = 0; @@ -340,7 +340,7 @@ cpu_adaptive_max_pool_channels_last( } template -void cpu_adaptive_max_pool_backward( +void cpu_adaptive_max_pool2d_backward( const Tensor& grad_input_, const Tensor& grad_output_, const Tensor& indices_) { @@ -348,8 +348,8 @@ void cpu_adaptive_max_pool_backward( 
auto indices = indices_.contiguous(); auto grad_input = grad_input_.contiguous(); - auto grad_output_data = grad_output.data_ptr(); - auto indices_data = indices.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); + auto indices_data = indices.const_data_ptr(); auto grad_input_data = grad_input.mutable_data_ptr(); int64_t ndim = grad_output.ndimension(); @@ -364,8 +364,8 @@ void cpu_adaptive_max_pool_backward( at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { for (const auto c : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + c * input_height * input_width; - scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; - int64_t* indices_ptr = indices_data + c * output_height * output_width; + const scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; + const int64_t* indices_ptr = indices_data + c * output_height * output_width; for (const auto oh : c10::irange(output_height)) { for (const auto ow : c10::irange(output_width)) { @@ -386,20 +386,20 @@ void cpu_adaptive_max_pool_backward( } template -void cpu_adaptive_max_pool_backward_channels_last( +void cpu_adaptive_max_pool2d_backward_channels_last( const Tensor& grad_input_, const Tensor& grad_output_, const Tensor& indices_) { TORCH_CHECK(grad_output_.ndimension() == 4, - "adaptive max pooling backward with channels last format supports tensors with 4 dims."); + "2d adaptive max pooling backward with channels last format supports tensors with 4 dims."); auto memory_format = at::MemoryFormat::ChannelsLast; auto grad_input = grad_input_.contiguous(memory_format); auto grad_output = grad_output_.contiguous(memory_format); auto indices = indices_.contiguous(memory_format); auto grad_input_data = grad_input.mutable_data_ptr(); - auto grad_output_data = grad_output.data_ptr(); - auto indices_data = indices.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); + auto indices_data = indices.const_data_ptr(); int64_t nbatch = grad_input.size(0); int64_t channels = grad_input.size(1); @@ -412,13 +412,13 @@ void cpu_adaptive_max_pool_backward_channels_last( at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) { for (const auto n : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + n * input_height * input_width * channels; - scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels; - int64_t* indices_ptr = indices_data + n * output_height * output_width * channels; + const scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels; + const int64_t* indices_ptr = indices_data + n * output_height * output_width * channels; for (const auto oh : c10::irange(output_height)) { for (const auto ow : c10::irange(output_width)) { - scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels; - int64_t* ind = indices_ptr + oh * output_width * channels + ow * channels; + const scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels; + const int64_t* ind = indices_ptr + oh * output_width * channels + ow * channels; // TODO: gcc vectorization for (const auto c : c10::irange(channels)) { int64_t maxindex = ind[c]; @@ -443,13 +443,13 @@ void adaptive_max_pool2d_kernel_impl( case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, input.scalar_type(), "adaptive_max_pool2d", [&] { using param_t = at::opmath_type; - cpu_adaptive_max_pool(output, indices, 
input, output_size); + cpu_adaptive_max_pool2d(output, indices, input, output_size); }); break; } case at::MemoryFormat::ChannelsLast: { AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, input.scalar_type(), "adaptive_max_pool2d_channels_last", [&]{ - cpu_adaptive_max_pool_channels_last(output, indices, input, output_size); + cpu_adaptive_max_pool2d_channels_last(output, indices, input, output_size); }); break; } @@ -466,13 +466,512 @@ void adaptive_max_pool2d_backward_kernel_impl( switch (grad_input.suggest_memory_format()) { case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, grad_output.scalar_type(), "adaptive_max_pool2d_backward", [&] { - cpu_adaptive_max_pool_backward(grad_input, grad_output, indices); + cpu_adaptive_max_pool2d_backward(grad_input, grad_output, indices); }); break; } case at::MemoryFormat::ChannelsLast: { AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, grad_output.scalar_type(), "adaptive_max_pool2d_backward_channels_last", [&]{ - cpu_adaptive_max_pool_backward_channels_last(grad_input, grad_output, indices); + cpu_adaptive_max_pool2d_backward_channels_last(grad_input, grad_output, indices); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } +} + +template +void cpu_adaptive_max_pool3d( + const Tensor& output_, + const Tensor& indices_, + const Tensor& input_, + IntArrayRef output_size) { + auto input = input_.contiguous(); + auto output = output_.contiguous(); + auto indices = indices_.contiguous(); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + auto indices_data = indices.data_ptr(); + + int64_t ndim = input.ndimension(); + // treat batch size and channels as one dimension + int64_t channels = ndim == 4 ? 
input.size(0) : input.size(0) * input.size(1); + int64_t input_depth = input.size(-3); + int64_t input_height = input.size(-2); + int64_t input_width = input.size(-1); + int64_t output_depth = output_size[0]; + int64_t output_height = output_size[1]; + int64_t output_width = output_size[2]; + + // parallel on dim of N, C + at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + scalar_t* input_ptr = input_data + c * input_depth * input_height * input_width; + scalar_t* output_ptr = output_data + c * output_depth * output_height * output_width; + int64_t* indices_ptr = indices_data + c * output_depth * output_height * output_width; + + for (const auto od : c10::irange(output_depth)) { + int64_t id0 = start_index(od, output_depth, input_depth); + int64_t id1 = end_index(od, output_depth, input_depth); + for (const auto oh : c10::irange(output_height)) { + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + + for (const auto ow : c10::irange(output_width)) { + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + + // compute local max + int64_t maxindex = id0 * input_height * input_width + ih0 * input_width + iw0; + accscalar_t maxval = -std::numeric_limits::infinity(); + for (int64_t id = id0; id < id1; id ++) { + for (int64_t ih = ih0; ih < ih1; ih ++) { + for (int64_t iw = iw0; iw < iw1; iw ++) { + int64_t index = id * input_height * input_width + ih * input_width + iw; + scalar_t val = input_ptr[index]; + if ((val > maxval) || std::isnan(val)) { + maxval = val; + maxindex = index; + } + } + } + } + + // set output to local max and store location of max + output_ptr[od * output_height * output_width + oh * output_width + ow] = maxval; + indices_ptr[od * output_height * output_width + oh * output_width + ow] = scalar_t(maxindex); + } + } + } + } + }); + + if (!output_.is_contiguous()) { + output_.copy_(output); + } + if (!indices_.is_contiguous()) { + indices_.copy_(indices); + } +} + +template +typename std::enable_if_t>, void> +cpu_adaptive_max_pool3d_channels_last( + const Tensor& output_, + const Tensor& indices_, + const Tensor& input_, + IntArrayRef output_size) { + TORCH_CHECK(input_.ndimension() == 5, + "3d adaptive max pooling with channels last format supports tensors with 5 dims"); + auto memory_format = at::MemoryFormat::ChannelsLast3d; + auto input = input_.contiguous(memory_format); + auto output = output_.contiguous(memory_format); + auto indices = indices_.contiguous(memory_format); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + auto indices_data = indices.data_ptr(); + + int64_t nbatch = input.size(0); + int64_t channels = input.size(1); + int64_t input_depth = input.size(2); + int64_t input_height = input.size(3); + int64_t input_width = input.size(4); + int64_t output_depth = output_size[0]; + int64_t output_height = output_size[1]; + int64_t output_width = output_size[2]; + + using Vec = vec::Vectorized; + using integer_t = vec::int_same_size_t; + using iVec = vec::Vectorized; + // for the convience of vectorization, use integer of the same size of scalar_t, + // e.g. 
int32_t for float, int64_t for double + // need to make sure doesn't overflow + TORCH_CHECK(input_height * input_width <= std::numeric_limits::max()); + + // parallel on dim of N, H, W + at::parallel_for(0, nbatch * output_depth * output_height * output_width, 0, [&](int64_t begin, int64_t end) { + int64_t n = 0; + int64_t od = 0; + int64_t oh = 0; + int64_t ow = 0; + data_index_init(begin, n, nbatch, od, output_depth, oh, output_height, ow, output_width); + + int64_t size = channels; + int64_t len = size - (size % Vec::size()); + // temp buffer holding index with integer_t + auto index_buffer = std::make_unique(len); + + for (const auto i : c10::irange(begin, end)) { + int64_t id0 = start_index(od, output_depth, input_depth); + int64_t id1 = end_index(od, output_depth, input_depth); + + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + + scalar_t* out = output_data + i * channels; + int64_t* ind = indices_data + i * channels; + + // Pass I: init out lane + iVec index0_vec = iVec(id0 * input_height * input_width + ih0 * input_width + iw0); + Vec out_vec = Vec(-std::numeric_limits::infinity()); + int64_t d1 = 0; + for (; d1 < len; d1 += Vec::size()) { + index0_vec.store(index_buffer.get() + d1); + out_vec.store(out + d1); + } + for (; d1 < size; d1++) { + ind[d1] = id0 * input_height * input_width + ih0 * input_width + iw0; + out[d1] = -std::numeric_limits::infinity(); + } + // Pass II: compute local max + for (int64_t id = id0; id < id1; id ++) { + for (int64_t ih = ih0; ih < ih1; ih ++) { + for (int64_t iw = iw0; iw < iw1; iw ++) { + scalar_t* in = input_data + n * input_depth * input_height * input_width * channels + + id * input_height * input_width * channels + ih * input_width * channels + iw * channels; + + int64_t d2 = 0; + for (; d2 < len; d2 += Vec::size()) { + iVec index_vec = iVec(id * input_height * input_width + ih * input_width + iw); + Vec val_vec = Vec::loadu(in + d2); + iVec maxindex_vec = iVec::loadu(index_buffer.get() + d2); + Vec maxval_vec = Vec::loadu(out + d2); + + // true = all ones, false = all zeros + Vec mask = (val_vec > maxval_vec) | val_vec.isnan(); + iVec imask = vec::cast(mask); + Vec out_vec = Vec::blendv(maxval_vec, val_vec, mask); + iVec ind_vec = iVec::blendv(maxindex_vec, index_vec, imask); + + out_vec.store(out + d2); + ind_vec.store(index_buffer.get() + d2); + } + for (; d2 < size; d2++) { + int64_t index = id * input_height * input_width + ih * input_width + iw; + scalar_t val = in[d2]; + int64_t maxindex = ind[d2]; + scalar_t maxval = out[d2]; + + bool mask = (val > maxval) || std::isnan(val); + out[d2] = mask ? val : maxval; + ind[d2] = mask ? 
index : maxindex; + } + } + } + } + // convert indice data type + vec::convert(index_buffer.get(), ind, len); + + // move on to next output index + data_index_step(n, nbatch, od, output_depth, oh, output_height, ow, output_width); + } + }); + + if (!output_.is_contiguous(memory_format)) { + output_.copy_(output); + } + if (!indices_.is_contiguous(memory_format)) { + indices_.copy_(indices); + } +} + +template +typename std::enable_if_t>, void> +cpu_adaptive_max_pool3d_channels_last( + const Tensor& output_, + const Tensor& indices_, + const Tensor& input_, + IntArrayRef output_size) { + TORCH_CHECK(input_.ndimension() == 5, + "3d adaptive max pooling with channels last format supports tensors with 5 dims"); + auto memory_format = at::MemoryFormat::ChannelsLast3d; + auto input = input_.contiguous(memory_format); + auto output = output_.contiguous(memory_format); + auto indices = indices_.contiguous(memory_format); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + auto indices_data = indices.data_ptr(); + + int64_t nbatch = input.size(0); + int64_t channels = input.size(1); + int64_t input_depth = input.size(2); + int64_t input_height = input.size(3); + int64_t input_width = input.size(4); + int64_t output_depth = output_size[0]; + int64_t output_height = output_size[1]; + int64_t output_width = output_size[2]; + + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + using iVec = vec::Vectorized; + // need to make sure doesn't overflow + TORCH_CHECK(input_height * input_width <= std::numeric_limits::max()); + + // parallel on dim of N, H, W + at::parallel_for(0, nbatch * output_depth * output_height * output_width, 0, [&](int64_t begin, int64_t end) { + int64_t n = 0; + int64_t od = 0; + int64_t oh = 0; + int64_t ow = 0; + data_index_init(begin, n, nbatch, od, output_depth, oh, output_height, ow, output_width); + + int64_t size = channels; + int64_t len = size - (size % bVec::size()); + // temp buffer holding index with integer_t + auto index_buffer = std::make_unique(len); + // temp buffer holding max value with float + auto max_arr = std::make_unique(size); + float* max = max_arr.get(); + + for (const auto i : c10::irange(begin, end)) { + int64_t id0 = start_index(od, output_depth, input_depth); + int64_t id1 = end_index(od, output_depth, input_depth); + + int64_t ih0 = start_index(oh, output_height, input_height); + int64_t ih1 = end_index(oh, output_height, input_height); + + int64_t iw0 = start_index(ow, output_width, input_width); + int64_t iw1 = end_index(ow, output_width, input_width); + + BFloat16* out = output_data + i * channels; + int64_t* ind = indices_data + i * channels; + + // Pass I: init out lane + iVec index0_ivec = iVec(id0 * input_height * input_width + ih0 * input_width + iw0); + fVec max_fvec = fVec(-std::numeric_limits::infinity()); + int64_t d1 = 0; + for (; d1 < len; d1 += fVec::size()) { + index0_ivec.store(index_buffer.get() + d1); + max_fvec.store(max + d1); + } + for (; d1 < size; d1++) { + ind[d1] = id0 * input_height * input_width + ih0 * input_width + iw0; + max[d1] = -std::numeric_limits::infinity(); + } + // Pass II: compute local max + for (int64_t id = id0; id < id1; id ++) { + for (int64_t ih = ih0; ih < ih1; ih ++) { + for (int64_t iw = iw0; iw < iw1; iw ++) { + BFloat16* in = input_data + n * input_depth * input_height * input_width * channels + + id * input_height * input_width * channels + ih * input_width * channels + iw * channels; + + int64_t d2 = 0; + for (; d2 < len; d2 += bVec::size()) { + iVec 
index_ivec = iVec(id * input_height * input_width + ih * input_width + iw); + bVec val_bvec = bVec::loadu(in + d2); + fVec val_fvec0, val_fvec1; + std::tie(val_fvec0, val_fvec1) = convert_bfloat16_float(val_bvec); + + iVec maxindex_ivec0 = iVec::loadu(index_buffer.get() + d2); + iVec maxindex_ivec1 = iVec::loadu(index_buffer.get() + d2 + iVec::size()); + fVec maxval_fvec0 = fVec::loadu(max + d2); + fVec maxval_fvec1 = fVec::loadu(max + d2 + fVec::size()); + + // true = all ones, false = all zeros + fVec mask0 = (val_fvec0 > maxval_fvec0) | val_fvec0.isnan(); + fVec mask1 = (val_fvec1 > maxval_fvec1) | val_fvec1.isnan(); + iVec imask0 = vec::cast(mask0); + iVec imask1 = vec::cast(mask1); + + fVec max_fvec0 = fVec::blendv(maxval_fvec0, val_fvec0, mask0); + fVec max_fvec1 = fVec::blendv(maxval_fvec1, val_fvec1, mask1); + iVec ind_ivec0 = iVec::blendv(maxindex_ivec0, index_ivec, imask0); + iVec ind_ivec1 = iVec::blendv(maxindex_ivec1, index_ivec, imask1); + + max_fvec0.store(max + d2); + max_fvec1.store(max + d2 + fVec::size()); + ind_ivec0.store(index_buffer.get() + d2); + ind_ivec1.store(index_buffer.get() + d2 + iVec::size()); + } + for (; d2 < size; d2++) { + int64_t index = id * input_height * input_width + ih * input_width + iw; + float val = float(in[d2]); + int64_t maxindex = ind[d2]; + float maxval = max[d2]; + + bool mask = (val > maxval) || std::isnan(val); + max[d2] = mask ? val : maxval; + ind[d2] = mask ? index : maxindex; + } + } + } + } + // Pass III: convert max values from float to bfloat16 + int64_t d3 = 0; + for (; d3 < len; d3 += bVec::size()) { + fVec max_fvec0 = fVec::loadu(max + d3); + fVec max_fvec1 = fVec::loadu(max + d3 + fVec::size()); + bVec max_bvec = convert_float_bfloat16(max_fvec0, max_fvec1); + max_bvec.store(out + d3); + } + for (; d3 < size; d3++) { + out[d3] = BFloat16(max[d3]); + } + // convert indice data type + vec::convert(index_buffer.get(), ind, len); + + // move on to next output index + data_index_step(n, nbatch, od, output_depth, oh, output_height, ow, output_width); + } + }); + + if (!output_.is_contiguous(memory_format)) { + output_.copy_(output); + } + if (!indices_.is_contiguous(memory_format)) { + indices_.copy_(indices); + } +} + +template +void cpu_adaptive_max_pool3d_backward( + const Tensor& grad_input_, + const Tensor& grad_output_, + const Tensor& indices_) { + auto grad_output = grad_output_.contiguous(); + auto indices = indices_.contiguous(); + auto grad_input = grad_input_.contiguous(); + + auto grad_output_data = grad_output.data_ptr(); + auto indices_data = indices.data_ptr(); + auto grad_input_data = grad_input.mutable_data_ptr(); + + int64_t ndim = grad_output.ndimension(); + // treat batch size and channels as one dimension + int64_t channels = ndim == 3 ? 
grad_output.size(0) : grad_output.size(0) * grad_output.size(1); + int64_t input_depth = grad_input.size(-3); + int64_t input_height = grad_input.size(-2); + int64_t input_width = grad_input.size(-1); + int64_t output_depth = grad_output.size(-3); + int64_t output_height = grad_output.size(-2); + int64_t output_width = grad_output.size(-1); + + // parallel on dim of N, C + at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + scalar_t* grad_input_ptr = grad_input_data + c * input_depth * input_height * input_width; + scalar_t* grad_output_ptr = grad_output_data + c * output_depth * output_height * output_width; + int64_t* indices_ptr = indices_data + c * output_depth * output_height * output_width; + + for (const auto od : c10::irange(output_depth)) { + for (const auto oh : c10::irange(output_height)) { + for (const auto ow : c10::irange(output_width)) { + // retrieve position of max + int64_t index = od * output_height * output_width + oh * output_width + ow; + int64_t maxindex = indices_ptr[index]; + + // update gradient + grad_input_ptr[maxindex] += grad_output_ptr[index]; + } + } + } + } + }); + + if (!grad_input_.is_contiguous()) { + grad_input_.copy_(grad_input); + } +} + +template +void cpu_adaptive_max_pool3d_backward_channels_last( + const Tensor& grad_input_, + const Tensor& grad_output_, + const Tensor& indices_) { + TORCH_CHECK(grad_output_.ndimension() == 5, + "3d adaptive max pooling backward with channels last format supports tensors with 5 dims."); + auto memory_format = at::MemoryFormat::ChannelsLast3d; + auto grad_input = grad_input_.contiguous(memory_format); + auto grad_output = grad_output_.contiguous(memory_format); + auto indices = indices_.contiguous(memory_format); + + auto grad_input_data = grad_input.mutable_data_ptr(); + auto grad_output_data = grad_output.data_ptr(); + auto indices_data = indices.data_ptr(); + + int64_t nbatch = grad_input.size(0); + int64_t channels = grad_input.size(1); + int64_t input_depth = grad_input.size(2); + int64_t input_height = grad_input.size(3); + int64_t input_width = grad_input.size(4); + int64_t output_depth = grad_output.size(2); + int64_t output_height = grad_output.size(3); + int64_t output_width = grad_output.size(4); + + // parallel on dim N + at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) { + for (const auto n : c10::irange(begin, end)) { + scalar_t* grad_input_ptr = grad_input_data + n * input_depth * input_height * input_width * channels; + scalar_t* grad_output_ptr = grad_output_data + n * output_depth * output_height * output_width * channels; + int64_t* indices_ptr = indices_data + n * output_depth * output_height * output_width * channels; + + for (const auto od : c10::irange(output_depth)) { + for (const auto oh : c10::irange(output_height)) { + for (const auto ow : c10::irange(output_width)) { + scalar_t* gout = grad_output_ptr + od * output_height * output_width * channels + oh * output_width * channels + ow * channels; + int64_t* ind = indices_ptr + od * output_height * output_width * channels + oh * output_width * channels + ow * channels; + // TODO: gcc vectorization + for (const auto c : c10::irange(channels)) { + int64_t maxindex = ind[c]; + grad_input_ptr[maxindex * channels + c] += gout[c]; + } + } + } + } + } + }); + + if (!grad_input_.is_contiguous(memory_format)) { + grad_input_.copy_(grad_input); + } +} + +void adaptive_max_pool3d_kernel_impl( + const Tensor& output, + const Tensor& indices, + const Tensor& input, + 
IntArrayRef output_size) { + switch (input.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, input.scalar_type(), "adaptive_max_pool3d", [&] { + using param_t = at::opmath_type; + cpu_adaptive_max_pool3d(output, indices, input, output_size); + }); + break; + } + case at::MemoryFormat::ChannelsLast3d: { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, input.scalar_type(), "adaptive_max_pool3d_channels_last", [&]{ + cpu_adaptive_max_pool3d_channels_last(output, indices, input, output_size); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } +} + +void adaptive_max_pool3d_backward_kernel_impl( + const Tensor& grad_input, + const Tensor& grad_output, + const Tensor& indices) { + // can't use grad_output memory format to switch here since grad_output might be NC11 + switch (grad_input.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, grad_output.scalar_type(), "adaptive_max_pool3d_backward", [&] { + cpu_adaptive_max_pool3d_backward(grad_input, grad_output, indices); + }); + break; + } + case at::MemoryFormat::ChannelsLast3d: { + AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::BFloat16, ScalarType::Half, grad_output.scalar_type(), "adaptive_max_pool3d_backward_channels_last", [&]{ + cpu_adaptive_max_pool3d_backward_channels_last(grad_input, grad_output, indices); }); break; } @@ -485,5 +984,7 @@ void adaptive_max_pool2d_backward_kernel_impl( REGISTER_DISPATCH(adaptive_max_pool2d_kernel, &adaptive_max_pool2d_kernel_impl); REGISTER_DISPATCH(adaptive_max_pool2d_backward_kernel, &adaptive_max_pool2d_backward_kernel_impl); +REGISTER_DISPATCH(adaptive_max_pool3d_kernel, &adaptive_max_pool3d_kernel_impl); +REGISTER_DISPATCH(adaptive_max_pool3d_backward_kernel, &adaptive_max_pool3d_backward_kernel_impl); } // at::native diff --git a/aten/src/ATen/native/cpu/AmpGradScalerKernels.cpp b/aten/src/ATen/native/cpu/AmpGradScalerKernels.cpp new file mode 100644 index 0000000000000..005b9c15060cc --- /dev/null +++ b/aten/src/ATen/native/cpu/AmpGradScalerKernels.cpp @@ -0,0 +1,198 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + +namespace { +// Follow the implementations of CUDA. +// Multiplies each tensor in scaled_grads by inv_scale in-place. +// If any element of any tensor in scaled_grads is inf or NaN, sets found_inf +// to 1.0. +// +// Args: +// scaled_grads: A TensorList of scaled gradient tensors. May contain infs or +// NaNs. found_inf: A single-element float tensor to which 1.0 will be written +// if any gradient contain infs/nans. +// Pre-zeroing found_inf, if appropriate, is the responsibility of +// the caller. +// inv_scale: The inverse of the scale factor by which scaled_grads are +// currently multiplied. 
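Editorial illustration only, not part of the patch: the comment block above describes the unscale semantics, and the sketch below restates them as an unfused scalar reference model of what the vectorized kernel that follows computes. The names here (unscale_reference, grads) are hypothetical.

#include <ATen/ATen.h>
#include <vector>

// For every gradient g: flag found_inf if g contains inf/NaN, then scale g by inv_scale.
void unscale_reference(std::vector<at::Tensor>& grads,
                       at::Tensor& found_inf,          // 1-element float CPU tensor
                       const at::Tensor& inv_scale) {  // 1-element float CPU tensor
  const float s = inv_scale.item<float>();
  for (auto& g : grads) {
    // Check finiteness of the original values, matching the fused kernel's ordering.
    if (!at::isfinite(g).all().item<bool>()) {
      found_inf.fill_(1.f);
    }
    g.mul_(s);  // in-place unscale; numerically a no-op when s == 1.f
  }
}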
+void _amp_foreach_non_finite_check_and_unscale_cpu_kernel( + TensorList scaled_grads, + at::Tensor& found_inf, + const at::Tensor& inv_scale) { + if (scaled_grads.size() == 0) { + return; + } + + TORCH_CHECK(inv_scale.is_cpu(), "inv_scale must be a CPU tensor."); + TORCH_CHECK(found_inf.is_cpu(), "found_inf must be a CPU tensor."); + TORCH_CHECK(inv_scale.numel() == 1, "inv_scale must be a 1-element tensor."); + TORCH_CHECK(found_inf.numel() == 1, "found_inf must be a 1-element tensor."); + TORCH_CHECK( + inv_scale.scalar_type() == at::ScalarType::Float, + "inv_scale must be a float tensor."); + TORCH_CHECK( + found_inf.scalar_type() == at::ScalarType::Float, + "found_inf must be a float tensor."); + + // Ensures client code (GradScaler) filtered scaled_grads by dtype. + at::native::check_foreach_api_restrictions(scaled_grads); + for (const at::Tensor& t : scaled_grads) { + TORCH_CHECK(t.is_cpu(), "one of scaled_grads was not a CPU tensor."); + TORCH_CHECK( + t.layout() == at::kStrided, + "one of scaled_grads was not a strided tensor."); + auto iter = at::TensorIterator::unary_op( + const_cast(t), const_cast(t)); + if (at::isReducedFloatingType(iter.dtype())) { + AT_DISPATCH_REDUCED_FLOATING_TYPES( + iter.dtype(), + "_amp_foreach_non_finite_check_and_unscale_cpu", + [&iter, &found_inf, &inv_scale] { + auto* found_inf_ptr = found_inf.data_ptr(); + auto* inv_scale_ptr = inv_scale.data_ptr(); + + using opmath_t = at::opmath_type; + + at::native::cpu_kernel_vec( + iter, + [found_inf_ptr, inv_scale_ptr](scalar_t val_in) -> scalar_t { + auto val = static_cast(val_in); + if (!std::isfinite(val)) { + *found_inf_ptr = 1.f; + } + // Every thread accesses inv_scale, but it will hit in cache. + const auto inv_scale_val = *inv_scale_ptr; + return static_cast( + inv_scale_val == 1.f ? val : val * inv_scale_val); + }, + [found_inf_ptr, inv_scale_ptr](Vectorized val_vec) -> Vectorized{ + auto [val_vec0, val_vec1] = convert_to_float(val_vec); + if (val_vec0.has_inf_nan() || val_vec1.has_inf_nan()) { + *found_inf_ptr = 1.f; + } + // Every thread accesses inv_scale, but it will hit in cache. + const auto inv_scale_val = *inv_scale_ptr; + val_vec0 = inv_scale_val == 1.f ? val_vec0 : val_vec0 * Vectorized(inv_scale_val); + val_vec1 = inv_scale_val == 1.f ? val_vec1 : val_vec1 * Vectorized(inv_scale_val); + return convert_from_float(val_vec0, val_vec1); + }); + }); + } else { + AT_DISPATCH_FLOATING_TYPES( + iter.dtype(), + "_amp_foreach_non_finite_check_and_unscale_cpu", + [&iter, &found_inf, &inv_scale] { + auto* found_inf_ptr = found_inf.data_ptr(); + auto* inv_scale_ptr = inv_scale.data_ptr(); + at::native::cpu_kernel_vec( + iter, + [found_inf_ptr, inv_scale_ptr](scalar_t val_in) -> scalar_t { + if (!std::isfinite(val_in)) { + *found_inf_ptr = 1.f; + } + // Every thread accesses inv_scale, but it will hit in cache. + const auto inv_scale_val = *inv_scale_ptr; + return static_cast( + inv_scale_val == 1.f ? val_in : val_in * inv_scale_val); + }, + [found_inf_ptr, inv_scale_ptr](Vectorized val_vec) -> Vectorized{ + if (val_vec.has_inf_nan()) { + *found_inf_ptr = 1.f; + } + // Every thread accesses inv_scale, but it will hit in cache. + const auto inv_scale_val = *inv_scale_ptr; + return inv_scale_val == 1.f ? val_vec : val_vec * Vectorized(inv_scale_val); + }); + }); + } + } +} + +// _amp_update_scale_cpu updates the scale tensor in place. +// +// Args: +// current_scale: A one-element float tensor containing the scale value. 
+// growth_tracker: A one-element IntTensor containing the number of recent +// consecutive unskipped steps. found_inf: A one-element float tensor. If > 0, +// indicates that infs/nans were found by the relevant +// prior _amp_non_finite_check_and_unscale_cpu call, and 0 if no +// infs/nans were found. +// growth_factor: Multiplier if no infs/NaNs were found (typically slightly > +// 1). backoff_factor: Multiplier if infs/NaNs were found (typically 0.5). +// growth_interval: Number of consecutive unskipped steps that must occur for +// current_scale to be multiplied by +// growth_factor. +// +// Returns: +// current_scale +at::Tensor& _amp_update_scale_cpu_kernel( + at::Tensor& current_scale, + at::Tensor& growth_tracker, + const at::Tensor& found_inf, + double growth_factor, + double backoff_factor, + int64_t growth_interval) { + TORCH_CHECK(growth_tracker.is_cpu(), "growth_tracker must be a CPU tensor."); + TORCH_CHECK(current_scale.is_cpu(), "current_scale must be a CPU tensor."); + TORCH_CHECK(found_inf.is_cpu(), "found_inf must be a CPU tensor."); + TORCH_CHECK( + growth_tracker.numel() == 1, + "growth_tracker must be a 1-element tensor."); + TORCH_CHECK( + current_scale.numel() == 1, "current_scale must be a 1-element tensor."); + TORCH_CHECK(found_inf.numel() == 1, "found_inf must be a 1-element tensor."); + TORCH_CHECK( + growth_tracker.scalar_type() == at::ScalarType::Int, + "growth_tracker must be an int tensor."); + TORCH_CHECK( + current_scale.scalar_type() == at::ScalarType::Float, + "current_scale must be a float tensor."); + TORCH_CHECK( + found_inf.scalar_type() == at::ScalarType::Float, + "found_inf must be a float tensor."); + + float* current_scale_ptr = current_scale.data_ptr(); + int* growth_tracker_ptr = growth_tracker.data_ptr(); + float* found_inf_ptr = found_inf.data_ptr(); + + if (*found_inf_ptr) { + *current_scale_ptr = (*current_scale_ptr) * backoff_factor; + *growth_tracker_ptr = 0; + } else { + // Entering this branch means we just carried out a successful step, + // so growth_tracker is incremented before comparing to growth_interval. + auto successful = (*growth_tracker_ptr) + 1; + if (successful == growth_interval) { + auto new_scale = static_cast((*current_scale_ptr) * growth_factor); + // Do not grow the scale past fp32 bounds to inf. 
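        // Illustrative numbers only (they happen to match GradScaler's documented
        // defaults): with growth_factor = 2.0, backoff_factor = 0.5 and
        // growth_interval = 2000, a step that sets found_inf halves a scale of
        // 65536.f to 32768.f and resets growth_tracker, while 2000 consecutive
        // clean steps double it back to 65536.f; the isfinite check below simply
        // refuses a growth step whose product would overflow to inf.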
+ if (std::isfinite(new_scale)) { + *current_scale_ptr = new_scale; + } + *growth_tracker_ptr = 0; + } else { + *growth_tracker_ptr = successful; + } + } + + return current_scale; +} + +} // namespace + +REGISTER_DISPATCH(_amp_foreach_non_finite_check_and_unscale_cpu_stub, &_amp_foreach_non_finite_check_and_unscale_cpu_kernel); +REGISTER_DISPATCH(_amp_update_scale_cpu_stub, &_amp_update_scale_cpu_kernel); + +} // namespace at::native diff --git a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp index 67d9eda485ffd..572d5af43f651 100644 --- a/aten/src/ATen/native/cpu/AvgPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/AvgPoolKernel.cpp @@ -14,7 +14,7 @@ namespace at::native { namespace { template -void cpu_avg_pool( +void cpu_avg_pool2d( const Tensor& output_, const Tensor& input_, int64_t kW, int64_t kH, @@ -27,7 +27,7 @@ void cpu_avg_pool( auto input = input_.contiguous(); auto output = output_.contiguous(); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t numel = output.numel(); @@ -50,7 +50,7 @@ void cpu_avg_pool( output_data[i] = static_cast(0); // local pointers - scalar_t* input_ptr = input_data + c * input_height * input_width; + const scalar_t* input_ptr = input_data + c * input_height * input_width; // compute the mean of the input image... int64_t ih0 = oh * dH - padH; @@ -101,7 +101,7 @@ void cpu_avg_pool( template ::value, int>::type = 0> -void cpu_avg_pool_channels_last( +void cpu_avg_pool2d_channels_last( const Tensor& output_, const Tensor& input_, int64_t kW, int64_t kH, @@ -110,12 +110,12 @@ void cpu_avg_pool_channels_last( bool count_include_pad, c10::optional divisor_override) { TORCH_CHECK(input_.ndimension() == 4, - "average pooling with channels last format supports tensors with 4 dims"); + "2d average pooling with channels last format supports tensors with 4 dims"); auto memory_format = at::MemoryFormat::ChannelsLast; auto input = input_.contiguous(memory_format); auto output = output_.contiguous(memory_format); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t nbatch = input.size(0); @@ -179,7 +179,7 @@ void cpu_avg_pool_channels_last( // Pass II: compute local sum for (const auto ih : c10::irange(ih0, ih1)) { for (const auto iw : c10::irange(iw0, iw1)) { - scalar_t* in = input_data + n * input_height * input_width * channels + + const scalar_t* in = input_data + n * input_height * input_width * channels + ih * input_width * channels + iw * channels; int64_t d2 = 0; @@ -215,7 +215,7 @@ void cpu_avg_pool_channels_last( template ::value, int>::type = 0> -void cpu_avg_pool_channels_last( +void cpu_avg_pool2d_channels_last( const Tensor& output_, const Tensor& input_, int64_t kW, int64_t kH, @@ -224,12 +224,12 @@ void cpu_avg_pool_channels_last( bool count_include_pad, c10::optional divisor_override) { TORCH_CHECK(input_.ndimension() == 4, - "average pooling with channels last format supports tensors with 4 dims"); + "2d average pooling with channels last format supports tensors with 4 dims"); auto memory_format = at::MemoryFormat::ChannelsLast; auto input = input_.contiguous(memory_format); auto output = output_.contiguous(memory_format); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t nbatch = input.size(0); @@ -303,7 +303,7 @@ void cpu_avg_pool_channels_last( // Pass II: compute local sum for (const 
auto ih : c10::irange(ih0, ih1)) { for (const auto iw : c10::irange(iw0, iw1)) { - scalar_t* in = input_data + n * input_height * input_width * channels + + const scalar_t* in = input_data + n * input_height * input_width * channels + ih * input_width * channels + iw * channels; int64_t d2 = 0; @@ -347,7 +347,7 @@ void cpu_avg_pool_channels_last( } template -void cpu_avg_pool_backward( +void cpu_avg_pool2d_backward( const Tensor& grad_input_, const Tensor& grad_output_, int kW, int kH, @@ -358,7 +358,7 @@ void cpu_avg_pool_backward( auto grad_output = grad_output_.contiguous(); auto grad_input = grad_input_.contiguous(); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); auto grad_input_data = grad_input.mutable_data_ptr(); int64_t ndim = grad_output.ndimension(); @@ -373,7 +373,7 @@ void cpu_avg_pool_backward( at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { for (const auto c : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + c * input_height * input_width; - scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; + const scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; for (const auto oh : c10::irange(output_height)) { for (const auto ow : c10::irange(output_width)) { @@ -415,7 +415,7 @@ void cpu_avg_pool_backward( } template -void cpu_avg_pool_backward_channels_last( +void cpu_avg_pool2d_backward_channels_last( const Tensor& grad_input_, const Tensor& grad_output_, int kW, int kH, @@ -428,7 +428,7 @@ void cpu_avg_pool_backward_channels_last( auto grad_output = grad_output_.contiguous(memory_format); auto grad_input_data = grad_input.mutable_data_ptr(); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); int64_t nbatch = grad_input.size(0); int64_t channels = grad_input.size(1); @@ -442,7 +442,7 @@ void cpu_avg_pool_backward_channels_last( at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) { for (const auto n : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + n * input_height * input_width * channels; - scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels; + const scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels; for (const auto oh : c10::irange(output_height)) { for (const auto ow : c10::irange(output_width)) { @@ -463,11 +463,11 @@ void cpu_avg_pool_backward_channels_last( if(count_include_pad) { divide_factor = pool_size; } else { - divide_factor = (ih1 - ih0) * (iw1 - iw0); + divide_factor = (ih1 - ih0) * (iw1 - iw0); } } - scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels; + const scalar_t* gout = grad_output_ptr + oh * output_width * channels + ow * channels; int64_t size = channels; int64_t len = size - (size % Vec::size()); for (const auto ih : c10::irange(ih0, ih1)) { @@ -505,13 +505,13 @@ void avg_pool2d_kernel_impl( switch (input.suggest_memory_format()) { case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, input.scalar_type(), "avg_pool2d", [&] { - cpu_avg_pool(output, input, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override); + cpu_avg_pool2d(output, input, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override); }); break; } case at::MemoryFormat::ChannelsLast: { AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, input.scalar_type(), 
"avg_pool2d_channels_last", [&] { - cpu_avg_pool_channels_last(output, input, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override); + cpu_avg_pool2d_channels_last(output, input, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override); }); break; } @@ -531,13 +531,569 @@ void avg_pool2d_backward_kernel_impl( switch (grad_output.suggest_memory_format()) { case at::MemoryFormat::Contiguous: { AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, grad_output.scalar_type(), "avg_pool2d_backward", [&] { - cpu_avg_pool_backward(grad_input, grad_output, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override); + cpu_avg_pool2d_backward(grad_input, grad_output, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override); }); break; } case at::MemoryFormat::ChannelsLast: { AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, grad_output.scalar_type(), "avg_pool2d_backward_channels_last", [&] { - cpu_avg_pool_backward_channels_last(grad_input, grad_output, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override); + cpu_avg_pool2d_backward_channels_last(grad_input, grad_output, kW, kH, dW, dH, padW, padH, count_include_pad, divisor_override); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } +} + + +template +void cpu_avg_pool3d( + const Tensor& output_, + const Tensor& input_, + int64_t kW, int64_t kH, int64_t kD, + int64_t dW, int64_t dH, int64_t dD, + int64_t padW, int64_t padH, int64_t padD, + bool count_include_pad, + c10::optional divisor_override) { + using acc_t = at::opmath_type; + + auto input = input_.contiguous(); + auto output = output_.contiguous(); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + int64_t numel = output.numel(); + int64_t ndim = input.ndimension(); + // treat batch size and channels as one dimension + int64_t channels = ndim == 4 ? input.size(0) : input.size(0) * input.size(1); + int64_t input_depth = input.size(-3); + int64_t input_height = input.size(-2); + int64_t input_width = input.size(-1); + int64_t output_depth = output.size(-3); + int64_t output_height = output.size(-2); + int64_t output_width = output.size(-1); + + // parallel on dim N, C, D, H, W + at::parallel_for(0, numel, 0, [&](int64_t begin, int64_t end) { + int64_t c = 0; + int64_t od = 0; + int64_t oh = 0; + int64_t ow = 0; + data_index_init(begin, c, channels, od, output_depth, oh, output_height, ow, output_width); + + for (const auto i : c10::irange(begin, end)) { + output_data[i] = static_cast(0); + + // local pointers + scalar_t* input_ptr = input_data + c * input_depth * input_height * input_width; + + // compute the mean of the input image... 
+ int64_t id0 = od * dD - padD; + int64_t ih0 = oh * dH - padH; + int64_t iw0 = ow * dW - padW; + int64_t id1 = std::min(id0 + kD, input_depth + padD); + int64_t ih1 = std::min(ih0 + kH, input_height + padH); + int64_t iw1 = std::min(iw0 + kW, input_width + padW); + int64_t pool_size = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + id0 = std::max(id0, (int64_t) 0); + ih0 = std::max(ih0, (int64_t) 0); + iw0 = std::max(iw0, (int64_t) 0); + id1 = std::min(id1, input_depth); + ih1 = std::min(ih1, input_height); + iw1 = std::min(iw1, input_width); + + if (id0 >= id1 || ih0 >= ih1 || iw0 >= iw1) { + // move on to next output index + data_index_step(c, channels, od, output_depth, oh, output_height, ow, output_width); + continue; + } + + acc_t sum = 0; + + int64_t divide_factor; + if (divisor_override.has_value()) { + divide_factor = divisor_override.value(); + } else { + if(count_include_pad) { + divide_factor = pool_size; + } else { + divide_factor = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + } + } + + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { + sum += input_ptr[id * input_height * input_width + ih * input_width + iw]; + } + } + } + output_data[i] += scalar_t(sum / divide_factor); + + // move on to next output index + data_index_step(c, channels, od, output_depth, oh, output_height, ow, output_width); + } + }); + + if (!output_.is_contiguous()) { + output_.copy_(output); + } +} + +template ::value, int>::type = 0> +void cpu_avg_pool3d_channels_last( + const Tensor& output_, + const Tensor& input_, + int64_t kW, int64_t kH, int64_t kD, + int64_t dW, int64_t dH, int64_t dD, + int64_t padW, int64_t padH, int64_t padD, + bool count_include_pad, + c10::optional divisor_override) { + TORCH_CHECK(input_.ndimension() == 5, + "3d average pooling with channels last format supports tensors with 5 dims"); + auto memory_format = at::MemoryFormat::ChannelsLast3d; + auto input = input_.contiguous(memory_format); + auto output = output_.contiguous(memory_format); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + int64_t nbatch = input.size(0); + int64_t channels = input.size(1); + int64_t input_depth = input.size(2); + int64_t input_height = input.size(3); + int64_t input_width = input.size(4); + int64_t output_depth = output.size(2); + int64_t output_height = output.size(3); + int64_t output_width = output.size(4); + + using Vec = vec::Vectorized; + // parallel on dim N, H, W + at::parallel_for(0, nbatch * output_depth * output_height * output_width, 0, [&](int64_t begin, int64_t end) { + int64_t n = 0; + int64_t od = 0; + int64_t oh = 0; + int64_t ow = 0; + data_index_init(begin, n, nbatch, od, output_depth, oh, output_height, ow, output_width); + + int64_t size = channels; + int64_t len = size - (size % Vec::size()); + for (const auto i : c10::irange(begin, end)) { + // compute the mean of the input image... 
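      // The window/divisor arithmetic below mirrors the contiguous kernel above;
      // what changes in this channels-last path is the inner loop, which runs over
      // the contiguous channel dimension: Pass II accumulates Vec::size() channels
      // at a time via Vec::loadu(out + d) + Vec::loadu(in + d), Pass III divides by
      // Vec(divide_factor), and a scalar tail handles the remaining
      // size % Vec::size() channels.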
+ int64_t id0 = od * dD - padD; + int64_t ih0 = oh * dH - padH; + int64_t iw0 = ow * dW - padW; + int64_t id1 = std::min(id0 + kD, input_depth + padD); + int64_t ih1 = std::min(ih0 + kH, input_height + padH); + int64_t iw1 = std::min(iw0 + kW, input_width + padW); + int64_t pool_size = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + id0 = std::max(id0, (int64_t) 0); + ih0 = std::max(ih0, (int64_t) 0); + iw0 = std::max(iw0, (int64_t) 0); + id1 = std::min(id1, input_depth); + ih1 = std::min(ih1, input_height); + iw1 = std::min(iw1, input_width); + + int64_t divide_factor; + if (divisor_override.has_value()) { + divide_factor = divisor_override.value(); + } else { + if(count_include_pad) { + divide_factor = pool_size; + } else { + divide_factor = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + } + } + + scalar_t* out = output_data + i * channels; + + // Pass I: zero the out lane + int64_t d1 = 0; + for (; d1 < len; d1 += Vec::size()) { + Vec out_vec = Vec(scalar_t(0)); + out_vec.store(out + d1); + } + for (; d1 < size; d1++) { + out[d1] = scalar_t(0); + } + + if (id0 >= id1 || ih0 >= ih1 || iw0 >= iw1) { + // move on to next output index + data_index_step(n, nbatch, od, output_depth, oh, output_height, ow, output_width); + continue; + } + + // Pass II: compute local sum + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { + scalar_t* in = input_data + n * input_depth * input_height * input_width * channels + + id * input_height * input_width * channels + ih * input_width * channels + iw * channels; + + int64_t d2 = 0; + for (; d2 < len; d2 += Vec::size()) { + Vec out_vec = Vec::loadu(out + d2) + Vec::loadu(in + d2); + out_vec.store(out + d2); + } + for (; d2 < size; d2++) { + out[d2] += in[d2]; + } + } + } + } + + // Pass III: compute local average + int64_t d3 = 0; + for (; d3 < len; d3 += Vec::size()) { + Vec out_vec = Vec::loadu(out + d3) / Vec(scalar_t(divide_factor)); + out_vec.store(out + d3); + } + for (; d3 < size; d3++) { + out[d3] = out[d3] / divide_factor; + } + + // move on to next output index + data_index_step(n, nbatch, od, output_depth, oh, output_height, ow, output_width); + } + }); + + if (!output_.is_contiguous(memory_format)) { + output_.copy_(output); + } +} + +template ::value, int>::type = 0> +void cpu_avg_pool3d_channels_last( + const Tensor& output_, + const Tensor& input_, + int64_t kW, int64_t kH, int64_t kD, + int64_t dW, int64_t dH, int64_t dD, + int64_t padW, int64_t padH, int64_t padD, + bool count_include_pad, + c10::optional divisor_override) { + TORCH_CHECK(input_.ndimension() == 5, + "3d average pooling with channels last format supports tensors with 5 dims"); + auto memory_format = at::MemoryFormat::ChannelsLast3d; + auto input = input_.contiguous(memory_format); + auto output = output_.contiguous(memory_format); + + auto input_data = input.data_ptr(); + auto output_data = output.data_ptr(); + + int64_t nbatch = input.size(0); + int64_t channels = input.size(1); + int64_t input_depth = input.size(2); + int64_t input_height = input.size(3); + int64_t input_width = input.size(4); + int64_t output_depth = output.size(2); + int64_t output_height = output.size(3); + int64_t output_width = output.size(4); + + using bVec = vec::Vectorized; + using fVec = vec::Vectorized; + // parallel on dim N, H, W + at::parallel_for(0, nbatch * output_depth * output_height * output_width, 0, [&](int64_t begin, int64_t end) { + int64_t n = 0; + int64_t od = 0; + int64_t oh = 0; + int64_t ow = 0; + 
data_index_init(begin, n, nbatch, od, output_depth, oh, output_height, ow, output_width); + + // temp buffer for sum, use float as accumulation type + // can't reuse output buffer to store sum since it is BFloat16 + auto sum_arr = std::make_unique(channels); + float* sum = sum_arr.get(); + + int64_t size = channels; + for (const auto i : c10::irange(begin, end)) { + // compute the mean of the input image... + int64_t id0 = od * dD - padD; + int64_t ih0 = oh * dH - padH; + int64_t iw0 = ow * dW - padW; + int64_t id1 = std::min(id0 + kD, input_depth + padD); + int64_t ih1 = std::min(ih0 + kH, input_height + padH); + int64_t iw1 = std::min(iw0 + kW, input_width + padW); + int64_t pool_size = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + id0 = std::max(id0, (int64_t) 0); + ih0 = std::max(ih0, (int64_t) 0); + iw0 = std::max(iw0, (int64_t) 0); + id1 = std::min(id1, input_depth); + ih1 = std::min(ih1, input_height); + iw1 = std::min(iw1, input_width); + + int64_t divide_factor; + if (divisor_override.has_value()) { + divide_factor = divisor_override.value(); + } else { + if(count_include_pad) { + divide_factor = pool_size; + } else { + divide_factor = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + } + } + + BFloat16* out = output_data + i * channels; + + // Pass I: zero the out lane + int64_t d1 = 0; + for (; d1 < size - (size % fVec::size()); d1 += fVec::size()) { + fVec sum_fvec = fVec(float(0)); + sum_fvec.store(sum + d1); + } + for (; d1 < size; d1++) { + sum[d1] = float(0); + } + + if (id0 >= id1 || ih0 >= ih1 || iw0 >= iw1) { + // since we are not directly using output as the accumulation buffer, + // in case the kernel window is out of range, need to zero the output buffer here. + for (int64_t k = 0; k < size; k++) { + out[k] = 0; + } + // move on to next output index + data_index_step(n, nbatch, od, output_depth, oh, output_height, ow, output_width); + continue; + } + + // Pass II: compute local sum + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { + BFloat16* in = input_data + n * input_depth * input_height * input_width * channels + + id * input_height * input_width * channels + ih * input_width * channels + iw * channels; + + int64_t d2 = 0; + for (; d2 < size - (size % bVec::size()); d2 += bVec::size()) { + bVec data_bvec = bVec::loadu(in + d2); + fVec data_fvec0, data_fvec1; + std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec); + + fVec sum_fvec0 = fVec::loadu(sum + d2) + data_fvec0; + fVec sum_fvec1 = fVec::loadu(sum + d2 + fVec::size()) + data_fvec1; + sum_fvec0.store(sum + d2); + sum_fvec1.store(sum + d2 + fVec::size()); + } + for (; d2 < size; d2++) { + sum[d2] += float(in[d2]); + } + } + } + } + + // Pass III: compute local average + int64_t d3 = 0; + for (; d3 < size - (size % bVec::size()); d3 += bVec::size()) { + fVec out_fvec0 = fVec::loadu(sum + d3) / fVec(float(divide_factor)); + fVec out_fvec1 = fVec::loadu(sum + d3 + fVec::size()) / fVec(float(divide_factor)); + + bVec out_bvec = convert_float_bfloat16(out_fvec0, out_fvec1); + out_bvec.store(out + d3); + } + for (; d3 < size; d3++) { + out[d3] = BFloat16(sum[d3] / divide_factor); + } + + // move on to next output index + data_index_step(n, nbatch, od, output_depth, oh, output_height, ow, output_width); + } + }); + + if (!output_.is_contiguous(memory_format)) { + output_.copy_(output); + } +} + +template +void cpu_avg_pool3d_backward( + const Tensor& grad_input_, + const Tensor& grad_output_, + int kW, int kH, int kD, 
+ int dW, int dH, int dD, + int padW, int padH, int padD, + bool count_include_pad, + c10::optional divisor_override) { + auto grad_output = grad_output_.contiguous(); + auto grad_input = grad_input_.contiguous(); + + auto grad_output_data = grad_output.data_ptr(); + auto grad_input_data = grad_input.mutable_data_ptr(); + + int64_t ndim = grad_output.ndimension(); + // treat batch size and channels as one dimension + int64_t channels = ndim == 4 ? grad_output.size(0) : grad_output.size(0) * grad_output.size(1); + int64_t input_depth = grad_input.size(-3); + int64_t input_height = grad_input.size(-2); + int64_t input_width = grad_input.size(-1); + int64_t output_depth = grad_output.size(-3); + int64_t output_height = grad_output.size(-2); + int64_t output_width = grad_output.size(-1); + + // parallel on dim of N, C + at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { + for (const auto c : c10::irange(begin, end)) { + scalar_t* grad_input_ptr = grad_input_data + c * input_depth * input_height * input_width; + scalar_t* grad_output_ptr = grad_output_data + c * output_depth * output_height * output_width; + + for (const auto od : c10::irange(output_depth)) { + for (const auto oh : c10::irange(output_height)) { + for (const auto ow : c10::irange(output_width)) { + int64_t id0 = od * dD - padD; + int64_t ih0 = oh * dH - padH; + int64_t iw0 = ow * dW - padW; + int64_t id1 = std::min(id0 + kD, input_depth + padD); + int64_t ih1 = std::min(ih0 + kH, input_height + padH); + int64_t iw1 = std::min(iw0 + kW, input_width + padW); + int64_t pool_size = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + id0 = std::max(id0, (int64_t) 0); + ih0 = std::max(ih0, (int64_t) 0); + iw0 = std::max(iw0, (int64_t) 0); + ih1 = std::min(ih1, input_height); + iw1 = std::min(iw1, input_width); + + int64_t divide_factor; + if (divisor_override.has_value()) { + divide_factor = divisor_override.value(); + } else { + if(count_include_pad) { + divide_factor = pool_size; + } else { + divide_factor = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + } + } + + scalar_t grad_delta = grad_output_ptr[od * output_height * output_width + oh * output_width + ow] / divide_factor; + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { + grad_input_ptr[id * input_height * input_width + ih * input_width + iw] += grad_delta; + } + } + } + } + } + } + } + }); + + if (!grad_input_.is_contiguous()) { + grad_input_.copy_(grad_input); + } +} + +template +void cpu_avg_pool3d_backward_channels_last( + const Tensor& grad_input_, + const Tensor& grad_output_, + int kW, int kH, int kD, + int dW, int dH, int dD, + int padW, int padH, int padD, + bool count_include_pad, + c10::optional divisor_override) { + auto memory_format = at::MemoryFormat::ChannelsLast3d; + auto grad_input = grad_input_.contiguous(memory_format); + auto grad_output = grad_output_.contiguous(memory_format); + + auto grad_input_data = grad_input.mutable_data_ptr(); + auto grad_output_data = grad_output.data_ptr(); + + int64_t nbatch = grad_input.size(0); + int64_t channels = grad_input.size(1); + int64_t input_depth = grad_input.size(2); + int64_t input_height = grad_input.size(3); + int64_t input_width = grad_input.size(4); + int64_t output_depth = grad_output.size(2); + int64_t output_height = grad_output.size(3); + int64_t output_width = grad_output.size(4); + + using Vec = vec::Vectorized; + // parallel on dim N + at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) { + for 
(const auto n : c10::irange(begin, end)) { + scalar_t* grad_input_ptr = grad_input_data + n * input_depth * input_height * input_width * channels; + scalar_t* grad_output_ptr = grad_output_data + n * output_height * output_width * channels; + + for (const auto od : c10::irange(output_depth)) { + for (const auto oh : c10::irange(output_height)) { + for (const auto ow : c10::irange(output_width)) { + int64_t id0 = od * dD - padD; + int64_t ih0 = oh * dH - padH; + int64_t iw0 = ow * dW - padW; + int64_t id1 = std::min(id0 + kD, input_depth + padD); + int64_t ih1 = std::min(ih0 + kH, input_height + padH); + int64_t iw1 = std::min(iw0 + kW, input_width + padW); + int64_t pool_size = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + id0 = std::max(id0, (int64_t) 0); + ih0 = std::max(ih0, (int64_t) 0); + iw0 = std::max(iw0, (int64_t) 0); + id1 = std::min(id1, input_depth); + ih1 = std::min(ih1, input_height); + iw1 = std::min(iw1, input_width); + + int64_t divide_factor; + if (divisor_override.has_value()) { + divide_factor = divisor_override.value(); + } else { + if(count_include_pad) { + divide_factor = pool_size; + } else { + divide_factor = (id1 - id0) * (ih1 - ih0) * (iw1 - iw0); + } + } + + scalar_t* gout = grad_output_ptr + od * output_height * output_width * channels + oh * output_width * channels + ow * channels; + int64_t size = channels; + int64_t len = size - (size % Vec::size()); + for (const auto id : c10::irange(id0, id1)) { + for (const auto ih : c10::irange(ih0, ih1)) { + for (const auto iw : c10::irange(iw0, iw1)) { + scalar_t* gin = grad_input_ptr + id * input_height * input_width * channels + ih * input_width * channels + iw * channels; + + int64_t d = 0; + for (; d < len; d += Vec::size()) { + Vec gin_vec = Vec::loadu(gin + d) + Vec::loadu(gout + d) / Vec(scalar_t(divide_factor)); + gin_vec.store(gin + d); + } + for (; d < size; d++) { + gin[d] += gout[d] / divide_factor; + } + } + } + } + } + } + } + } + }); + + if (!grad_input_.is_contiguous(memory_format)) { + grad_input_.copy_(grad_input); + } +} + + + +void avg_pool3d_kernel_impl( + const Tensor& output, + const Tensor& input, + int64_t kW, int64_t kH, int64_t kD, + int64_t dW, int64_t dH, int64_t dD, + int64_t padW, int64_t padH, int64_t padD, + bool count_include_pad, + c10::optional divisor_override) { + switch (input.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, input.scalar_type(), "avg_pool3d", [&] { + cpu_avg_pool3d(output, input, kW, kH, kD, dW, dH, dD, padW, padH, padD, count_include_pad, divisor_override); + }); + break; + } + case at::MemoryFormat::ChannelsLast: { + AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, input.scalar_type(), "avg_pool3d_channels_last", [&] { + cpu_avg_pool3d_channels_last(output, input, kW, kH, kD, dW, dH, dD, padW, padH, padD, count_include_pad, divisor_override); }); break; } @@ -546,9 +1102,39 @@ void avg_pool2d_backward_kernel_impl( } } + +void avg_pool3d_backward_kernel_impl( + const Tensor& grad_input, + const Tensor& grad_output, + int kW, int kH, int kD, + int dW, int dH, int dD, + int padW, int padH, int padD, + bool count_include_pad, + c10::optional divisor_override) { + switch (grad_output.suggest_memory_format()) { + case at::MemoryFormat::Contiguous: { + AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, grad_output.scalar_type(), "avg_pool3d_backward", [&] { + cpu_avg_pool3d_backward(grad_input, grad_output, kW, kH, kD, dW, dH, dD, padW, padH, padD, count_include_pad, 
divisor_override); + }); + break; + } + case at::MemoryFormat::ChannelsLast3d: { + AT_DISPATCH_FLOATING_TYPES_AND3(kLong, kBFloat16, kHalf, grad_output.scalar_type(), "avg_pool3d_backward_channels_last", [&] { + cpu_avg_pool3d_backward_channels_last(grad_input, grad_output, kW, kH, kD, dW, dH, dD, padW, padH, padD, count_include_pad, divisor_override); + }); + break; + } + default: + TORCH_CHECK(false, "Unsupported memory format. Supports only ChannelsLast, Contiguous"); + } +} + + } // anonymous namespace REGISTER_DISPATCH(avg_pool2d_kernel, &avg_pool2d_kernel_impl); REGISTER_DISPATCH(avg_pool2d_backward_kernel, &avg_pool2d_backward_kernel_impl); +REGISTER_DISPATCH(avg_pool3d_kernel, &avg_pool3d_kernel_impl); +REGISTER_DISPATCH(avg_pool3d_backward_kernel, &avg_pool3d_backward_kernel_impl); } // at::native diff --git a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp index 8958126d107e2..e008113c10b10 100644 --- a/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/BinaryOpsKernel.cpp @@ -32,8 +32,8 @@ inline Vectorized binary_op_scalar( const Vectorized& a, opmath_t b, const Op& op) { - Vectorized a0, a1, vec_b(b); - std::tie(a0, a1) = convert_to_float(a); + Vectorized vec_b(b); + auto [a0, a1] = convert_to_float(a); return convert_from_float(op(a0, vec_b), op(a1, vec_b)); } @@ -90,10 +90,7 @@ void atan2_kernel(TensorIteratorBase& iter) { kHalf, \ kBool, \ kBFloat16, \ - kFloat8_e5m2, \ - kFloat8_e5m2fnuz, \ - kFloat8_e4m3fn, \ - kFloat8_e4m3fnuz, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)) + AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)) #define _AT_DISPATCH_ALL_TYPES_NO_BOOL(TYPE, NAME, ...) \ AT_DISPATCH_V2( \ TYPE, \ @@ -102,12 +99,10 @@ void atan2_kernel(TensorIteratorBase& iter) { kComplexHalf, \ kHalf, \ kBFloat16, \ - kFloat8_e5m2, \ - kFloat8_e4m3fn, \ - AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)) + AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)) #define _AT_DISPATCH_MUL_TYPES(TYPE, NAME, ...) \ AT_DISPATCH_V2(TYPE, NAME, AT_WRAP(__VA_ARGS__), \ - kHalf, kBFloat16, kFloat8_e5m2, kFloat8_e4m3fn, AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)) + kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)) #else #define _AT_DISPATCH_ALL_TYPES_AND_BOOL(TYPE, NAME, ...) 
\ AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \ @@ -347,9 +342,8 @@ void remainder_kernel(TensorIteratorBase& iter) { return mod0; }, [=](Vectorized a, Vectorized b) { - Vectorized a0, a1, b0, b1; - std::tie(a0, a1) = convert_bfloat16_float(a); - std::tie(b0, b1) = convert_bfloat16_float(b); + auto [a0, a1] = convert_bfloat16_float(a); + auto [b0, b1] = convert_bfloat16_float(b); auto mod0 = a0.fmod(b0); auto mod1 = a1.fmod(b1); const auto zero = Vectorized(0); @@ -748,9 +742,8 @@ void smooth_l1_kernel(TensorIteratorBase& iter, double beta) { }, [&beta_val_vec, &point_five_vec]( Vectorized a, Vectorized b) { - Vectorized a0, a1, b0, b1; - std::tie(a0, a1) = convert_bfloat16_float(a); - std::tie(b0, b1) = convert_bfloat16_float(b); + auto [a0, a1] = convert_bfloat16_float(a); + auto [b0, b1] = convert_bfloat16_float(b); auto z = (a0 - b0).abs(); a0 = Vectorized::blendv( point_five_vec * z * z / beta_val_vec, @@ -835,9 +828,8 @@ void sigmoid_backward_kernel(TensorIteratorBase& iter) { return a0 * (float(1) - b0) * b0; }, [=](Vectorized a, Vectorized b) { - Vectorized a0, a1, b0, b1; - std::tie(a0, a1) = convert_bfloat16_float(a); - std::tie(b0, b1) = convert_bfloat16_float(b); + auto [a0, a1] = convert_bfloat16_float(a); + auto [b0, b1] = convert_bfloat16_float(b); a0 = a0 * (one_vec - b0) * b0; a1 = a1 * (one_vec - b1) * b1; return convert_float_bfloat16(a0, a1); @@ -933,9 +925,8 @@ void tanh_backward_kernel(TensorIteratorBase& iter) { return a0 * (float{1} - b0 * b0); }, [=](Vectorized a, Vectorized b) { - Vectorized a0, a1, b0, b1; - std::tie(a0, a1) = convert_to_float(a); - std::tie(b0, b1) = convert_to_float(b); + auto [a0, a1] = convert_to_float(a); + auto [b0, b1] = convert_to_float(b); a0 = a0 * (one_vec - b0 * b0); a1 = a1 * (one_vec - b1 * b1); return convert_from_float(a0, a1); @@ -1017,9 +1008,8 @@ void logaddexp_kernel(TensorIteratorBase& iter) { } }, [=](Vec a, Vec b) -> Vec { - Vectorized a0, a1, b0, b1; - std::tie(a0, a1) = convert_to_float(a); - std::tie(b0, b1) = convert_to_float(b); + auto [a0, a1] = convert_to_float(a); + auto [b0, b1] = convert_to_float(b); Vectorized inf(std::numeric_limits::infinity()); Vectorized m0 = maximum(a0, b0); Vectorized m1 = maximum(a1, b1); @@ -1082,9 +1072,8 @@ void logaddexp2_kernel(TensorIteratorBase& iter) { } }, [=](Vec a, Vec b) -> Vec { - Vectorized a0, a1, b0, b1; - std::tie(a0, a1) = convert_to_float(a); - std::tie(b0, b1) = convert_to_float(b); + auto [a0, a1] = convert_to_float(a); + auto [b0, b1] = convert_to_float(b); Vectorized inf(std::numeric_limits::infinity()); Vectorized inv_log_2_vec(inv_log_2); Vectorized m0 = maximum(a0, b0); diff --git a/aten/src/ATen/native/cpu/BlasKernel.cpp b/aten/src/ATen/native/cpu/BlasKernel.cpp index d0761584f0442..1cc53da3584ea 100644 --- a/aten/src/ATen/native/cpu/BlasKernel.cpp +++ b/aten/src/ATen/native/cpu/BlasKernel.cpp @@ -1,9 +1,41 @@ #define TORCH_ASSERT_NO_OPERATORS #include +#include #include +#include #include #include +#if defined(__aarch64__) && !defined(C10_MOBILE) +#include + +namespace at::native::blas_impl { +void fp16_gemv_notrans( + const int m, + const int n, + const float alpha, + const float16_t* a, + const int lda, + const float16_t* x, + const int incx, + const float beta, + float16_t* y, + const int incy); + +void fp16_gemv_trans( + const int m, + const int n, + const float alpha, + const float16_t* a, + const int lda, + const float16_t* x, + const int incx, + const float beta, + float16_t* y, + const int incy); +} +#endif + namespace at::native { namespace cpublas { 
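[Editor's note] The BlasKernel.cpp header hunk above forward-declares aarch64 half-precision GEMV helpers (fp16_gemv_notrans / fp16_gemv_trans) in at::native::blas_impl; the at::Half gemm specializations later in this file route to them when n == 1 and beta == 0. Below is a minimal scalar model of the transposed variant's expected semantics, with the BLAS-style argument convention inferred from those call sites; gemv_trans_ref and the template parameter T are illustrative names, not part of the patch, and this is a reference sketch rather than the NEON implementation.

    // y[j] = alpha * dot(column j of a, x) + beta * y[j], with the products
    // accumulated in float even when T is a 16-bit floating-point type.
    template <typename T>
    void gemv_trans_ref(int m, int n, float alpha, const T* a, int lda,
                        const T* x, int incx, float beta, T* y, int incy) {
      for (int j = 0; j < n; ++j) {
        float acc = 0.0f;
        for (int i = 0; i < m; ++i) {
          acc += float(a[j * lda + i]) * float(x[i * incx]);
        }
        float prev = (beta == 0.0f) ? 0.0f : beta * float(y[j * incy]);
        y[j * incy] = T(alpha * acc + prev);
      }
    }
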
namespace { @@ -121,6 +153,7 @@ gemm_notrans_( template void gemm_transa_( + TransposeType transa, int64_t m, int64_t n, int64_t k, opmath_t alpha, const scalar_t *a, int64_t lda, @@ -133,7 +166,7 @@ void gemm_transa_( const scalar_t *b_ = b; for (const auto j : c10::irange(n)) { const auto dot = sum(k, [&](int64_t l) -> opmath_t { - return static_cast(a_[l]) * static_cast(b_[l]); + return static_cast(transa == TransposeType::ConjTranspose ? conj_impl(a_[l]) : a_[l]) * static_cast(b_[l]); }); b_ += ldb; if (beta == opmath_t(0)) { @@ -149,6 +182,7 @@ void gemm_transa_( template typename std::enable_if::value, void>::type gemm_transb_( + TransposeType transb, int64_t m, int64_t n, int64_t k, @@ -166,7 +200,7 @@ gemm_transb_( // c += alpha * (a @ b.T) for (const auto l : c10::irange(k)) { for (const auto j : c10::irange(n)) { - opmath_t val = b[j + l * ldb] * alpha; + opmath_t val = (transb == TransposeType::ConjTranspose ? conj_impl(b[j + l * ldb]) : b[j + l * ldb]) * alpha; int64_t i_m = m / 4; for (const auto i_i : c10::irange(i_m)) { c[j * ldc + i_i * 4 + 0] += a[i_i * 4 + 0 + l * lda] * val; @@ -185,6 +219,7 @@ gemm_transb_( template typename std::enable_if::value, void>::type gemm_transb_( + TransposeType transb, int64_t m, int64_t n, int64_t k, @@ -201,7 +236,7 @@ gemm_transb_( for (const auto j : c10::irange(n)) { const auto dot = sum(k, [&](int64_t l) -> opmath_t { return static_cast(a[l * lda + i]) * - static_cast(b[l * ldb + j]); + static_cast(transb == TransposeType::ConjTranspose ? conj_impl(b[l * ldb + j]) : b[l * ldb + j]); }); if (beta == opmath_t(0)) { c[j * ldc + i] = alpha * dot; @@ -214,6 +249,7 @@ gemm_transb_( template void gemm_transab_( + TransposeType transa, TransposeType transb, int64_t m, int64_t n, int64_t k, opmath_t alpha, const scalar_t *a, int64_t lda, @@ -224,8 +260,8 @@ void gemm_transab_( for (const auto i : c10::irange(m)) { for (const auto j : c10::irange(n)) { const auto dot = sum(k, [&](int64_t l) -> opmath_t { - return static_cast(a[i * lda + l]) * - static_cast(b[l * ldb + j]); + return static_cast(transa == TransposeType::ConjTranspose ? conj_impl(a[i * lda + l]) : a[i * lda + l]) * + static_cast(transb == TransposeType::ConjTranspose ? 
conj_impl(b[l * ldb + j]) : b[l * ldb + j]); }); if (beta == opmath_t(0)) { @@ -237,6 +273,91 @@ void gemm_transab_( } } +#if defined(__aarch64__) && !defined(C10_MOBILE) +template <> +void gemm_notrans_( + int64_t m, + int64_t n, + int64_t k, + float alpha, + const at::Half* a, + int64_t lda, + const at::Half* b, + int64_t ldb, + float beta, + at::Half* c, + int64_t ldc) { + // c += alpha * (a @ b) + if (n == 1 && beta == 0.0) { + at::native::blas_impl::fp16_gemv_notrans(m, k, alpha, reinterpret_cast(a), lda, reinterpret_cast(b), 1, beta, reinterpret_cast(c), 1); + return; + } + for (const auto i : c10::irange(m)) { + for (const auto j : c10::irange(n)) { + const auto dot = sum(k, [&](int64_t l) -> float { + return float(c10::detail::fp16_from_bits(a[l * lda + i].x)) * + float(c10::detail::fp16_from_bits(b[j * ldb + l].x)); + }); + if (beta == 0) { + c[j * ldc + i] = alpha * dot; + } else { + c[j * ldc + i] = beta * c[j * ldc + i] + alpha * dot; + } + } + } +} + + +static float compute_dot(const float16_t *a, const float16_t *b, int64_t l) { + if ((l&3) != 0) { + return sum(l, [&](int64_t i) -> float { + return float(a[i]) * float(b[i]); + }); + } + float32x4_t rcv = vdupq_n_f32(0); + for (int64_t idx = 0; idx < l; idx += 4) { + float32x4_t aVec = vcvt_f32_f16(vld1_f16(a + idx)); + float32x4_t bVec = vcvt_f32_f16(vld1_f16(b + idx)); + rcv = vaddq_f32(rcv, vmulq_f32(aVec, bVec)); + } + auto sum = vpaddq_f32(rcv, rcv); + return vgetq_lane_f32(vpaddq_f32(sum, sum), 0); +} + +template <> +void gemm_transa_( + TransposeType transa, + int64_t m, int64_t n, int64_t k, + float alpha, + const at::Half *a, int64_t lda, + const at::Half *b, int64_t ldb, + float beta, + at::Half *c, int64_t ldc) { + // c = alpha * (a.T @ b) + beta * c + if (n == 1 && beta == 0.0) { + at::native::blas_impl::fp16_gemv_trans(k, m, alpha, reinterpret_cast(a), lda, reinterpret_cast(b), 1, beta, reinterpret_cast(c), 1); + return; + } + parallel_for(0, m, 1, [&](int64_t begin, int64_t end) { + const auto *a_ = a + begin * lda; + for (const auto i : c10::irange(begin, end)) { + const auto *b_ = b; + for (const auto j : c10::irange(n)) { + const auto dot = compute_dot(reinterpret_cast(a_), reinterpret_cast(b_), k); + b_ += ldb; + if (beta == 0) { + c[j*ldc+i] = alpha*dot; + } else { + c[j*ldc+i] = beta*c[j*ldc+i]+alpha*dot; + } + } + a_ += lda; + } + }); +} + +#endif + template void gemm_core_( TransposeType transa, TransposeType transb, @@ -250,23 +371,22 @@ void gemm_core_( transb == TransposeType::NoTranspose) { return gemm_notrans_(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } else if ( - transa == TransposeType::Transpose && - transb != TransposeType::Transpose) { - gemm_transa_(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + transa != TransposeType::NoTranspose && + transb == TransposeType::NoTranspose) { + gemm_transa_(transa, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } else if ( transa == TransposeType::NoTranspose && - transb == TransposeType::Transpose) { - gemm_transb_(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); - } else { // transa == TransposeType::Transpose && transb == - // TransposeType::Transpose - gemm_transab_(m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + transb != TransposeType::NoTranspose) { + gemm_transb_(transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + } else { + gemm_transab_(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } } #if !defined(C10_MOBILE) -#define _AT_DISPATCH_GEMM_TYPES(TYPE, NAME, ...) 
\ - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \ - kHalf, kBFloat16, kFloat8_e5m2, kFloat8_e4m3fn, \ +#define _AT_DISPATCH_GEMM_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND6( \ + kHalf, kBFloat16, kFloat8_e5m2, kFloat8_e4m3fn, kFloat8_e5m2fnuz, kFloat8_e4m3fnuz, \ TYPE, NAME, __VA_ARGS__) #else #define _AT_DISPATCH_GEMM_TYPES(TYPE, NAME, ...) \ diff --git a/aten/src/ATen/native/cpu/CatKernel.cpp b/aten/src/ATen/native/cpu/CatKernel.cpp index d3a83b2334515..23d9aa1708ba7 100644 --- a/aten/src/ATen/native/cpu/CatKernel.cpp +++ b/aten/src/ATen/native/cpu/CatKernel.cpp @@ -12,11 +12,11 @@ namespace at::native { namespace { struct InputMeta { - void* data_ptr; + const void* data_ptr; int64_t inner_size; InputMeta(const Tensor& t, int64_t dim, int64_t inner) - : data_ptr(t.data_ptr()) + : data_ptr(t.const_data_ptr()) , inner_size(t.sizes()[dim] * inner) {} }; @@ -38,7 +38,7 @@ void cat_serial_kernel_impl(const Tensor& result, const MaterializedITensorListR for (const auto i : c10::irange(outer)) { for (const auto j : c10::irange(ninputs)) { int64_t local_inner = inputs[j].inner_size; - scalar_t* input_ptr = (scalar_t*)(inputs[j].data_ptr) + i * local_inner; + const scalar_t* input_ptr = (const scalar_t*)(inputs[j].data_ptr) + i * local_inner; int64_t d = 0; for (; d < local_inner - (local_inner % Vec::size()); d += Vec::size()) { Vec in_vec = Vec::loadu(input_ptr + d); diff --git a/aten/src/ATen/native/cpu/CopyKernel.cpp b/aten/src/ATen/native/cpu/CopyKernel.cpp index a815896cc799c..906fa8911e884 100644 --- a/aten/src/ATen/native/cpu/CopyKernel.cpp +++ b/aten/src/ATen/native/cpu/CopyKernel.cpp @@ -71,7 +71,7 @@ static void reduced_float_copy_kernel(TensorIteratorBase &iter, bool requires_ne using Vecs = Vectorized; c10::SmallBuffer ptrs(2); dest_t* output_data = iter.tensor_base(0).data_ptr(); - scalar_t* input_data = iter.tensor_base(1).data_ptr(); + scalar_t* input_data = const_cast(iter.tensor_base(1).const_data_ptr()); ptrs[0] = reinterpret_cast(output_data); ptrs[1] = reinterpret_cast(input_data); @@ -139,7 +139,7 @@ static void reduced_float_copy_kernel(TensorIteratorBase &iter, bool requires_ne using Vecs = Vectorized; c10::SmallBuffer ptrs(2); dest_t* output_data = iter.tensor_base(0).data_ptr(); - source_t* input_data = iter.tensor_base(1).data_ptr(); + source_t* input_data = const_cast(iter.tensor_base(1).const_data_ptr()); ptrs[0] = reinterpret_cast(output_data); ptrs[1] = reinterpret_cast(input_data); diff --git a/aten/src/ATen/native/cpu/CrossKernel.cpp b/aten/src/ATen/native/cpu/CrossKernel.cpp index 0394a9f524abd..d982f63dd0508 100644 --- a/aten/src/ATen/native/cpu/CrossKernel.cpp +++ b/aten/src/ATen/native/cpu/CrossKernel.cpp @@ -21,8 +21,8 @@ static void apply_cross(const Tensor& result, const Tensor& a, const Tensor& b, int64_t b_stride = b.stride(dim); int64_t r_stride = result.stride(dim); - scalar_t *a_ptr = a.data_ptr(); - scalar_t *b_ptr = b.data_ptr(); + const scalar_t *a_ptr = a.const_data_ptr(); + const scalar_t *b_ptr = b.const_data_ptr(); scalar_t *r_ptr = result.data_ptr(); parallel_for(0, total, internal::GRAIN_SIZE, [&](int64_t s, int64_t e) { diff --git a/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp b/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp index 3a34ad3f7a6e8..573d5de9cde19 100644 --- a/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp +++ b/aten/src/ATen/native/cpu/DepthwiseConvKernel.cpp @@ -292,16 +292,21 @@ Tensor _convolution_depthwise3x3_winograd( bias_potentially_undefined : at::zeros({kernel_sizes[0]}, input.options()); + auto 
input_data = input.const_data_ptr(); + auto kernel_data = kernel.const_data_ptr(); + auto bias_data = bias.const_data_ptr(); + auto output_data = output.data_ptr(); + at::parallel_for(0, args.batch * args.out_channels, 0, [&](int64_t start, int64_t end) { for (const auto k : c10::irange(start, end)) { const int64_t g = k % args.out_channels; const int64_t i = k / (args.out_channels / groups); convolution_depthwise3x3_winograd_impl( args, - input.data_ptr() + i * input_hxw, - kernel.data_ptr() + g * 3 * 3, - bias.data_ptr() + g, - output.data_ptr() + k * output_hxw); + input_data + i * input_hxw, + kernel_data + g * 3 * 3, + bias_data + g, + output_data + k * output_hxw); } }); diff --git a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp index f2346759cbc09..04d82d365baa3 100644 --- a/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/DistanceOpsKernel.cpp @@ -146,7 +146,7 @@ struct Dist { template static void run_parallel_pdist(Tensor& result, const Tensor& self, const scalar_t p) { - const scalar_t * const self_start = self.data_ptr(); + const scalar_t * const self_start = self.const_data_ptr(); const scalar_t * const self_end = self_start + self.numel(); int64_t n = self.size(0); int64_t m = self.size(1); @@ -203,8 +203,8 @@ struct Dist { template static void run_parallel_cdist(Tensor& result, const Tensor& t1, const Tensor& t2, const scalar_t p) { - const scalar_t * const t1_start = t1.data_ptr(); - const scalar_t * const t2_start = t2.data_ptr(); + const scalar_t * const t1_start = t1.const_data_ptr(); + const scalar_t * const t2_start = t2.const_data_ptr(); int64_t d = t1.size(0); int64_t r1 = t1.size(-2); int64_t r2 = t2.size(-2); @@ -296,14 +296,14 @@ struct Dist { const int64_t m = self.size(1); const int64_t gs = grad.stride(0); - const scalar_t * const grad_start = grad.data_ptr(); - const scalar_t * const dist_start = dist.data_ptr(); - const scalar_t * const self_start = self.data_ptr(); + const scalar_t * const grad_start = grad.const_data_ptr(); + const scalar_t * const dist_start = dist.const_data_ptr(); + const scalar_t * const self_start = self.const_data_ptr(); scalar_t * const res_start = result.data_ptr(); // The only way to parallelize and avoid locking requires parallelizing // over the columns of the input, i.e. we compute the gradient for the - // first section of each vector independentaly of the second section, etc. + // first section of each vector independently of the second section, etc. at::parallel_for(0, m / Vec::size(), internal::GRAIN_SIZE / (8 * n * n), [p, n, m, gs, grad_start, dist_start, self_start, res_start](int64_t l, int64_t end) { const Vec pvec(p); @@ -367,10 +367,10 @@ struct Dist { //don't use grad.stride(-1), because if last dimension is 1, stride can be bogus. 
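[Editor's note] The DepthwiseConvKernel hunk above hoists the const_data_ptr()/data_ptr() calls out of the parallel_for body so the hot loop only does pointer arithmetic, and the DistanceOpsKernel changes switch read-only accesses to const_data_ptr(). A minimal sketch of that pattern, assuming a recent ATen that provides const_data_ptr/mutable_data_ptr and contiguous float tensors; scale_rows is a made-up helper, not part of the patch.

    #include <ATen/ATen.h>
    #include <ATen/Parallel.h>

    void scale_rows(const at::Tensor& in, at::Tensor& out, float s) {
      TORCH_CHECK(in.is_contiguous() && out.is_contiguous(), "expects contiguous tensors");
      // Fetch the pointers once, outside the parallel region.
      const float* in_data = in.const_data_ptr<float>();
      float* out_data = out.mutable_data_ptr<float>();
      const int64_t rows = in.size(0);
      const int64_t cols = in.size(1);
      at::parallel_for(0, rows, 0, [&](int64_t begin, int64_t end) {
        for (int64_t r = begin; r < end; ++r) {
          for (int64_t c = 0; c < cols; ++c) {
            out_data[r * cols + c] = s * in_data[r * cols + c];
          }
        }
      });
    }
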
const int64_t gs = 1; - const scalar_t * const grad_start = grad.data_ptr(); - const scalar_t * const dist_start = dist.data_ptr(); - const scalar_t * const t1_start = t1.data_ptr(); - const scalar_t * const t2_start = t2.data_ptr(); + const scalar_t * const grad_start = grad.const_data_ptr(); + const scalar_t * const dist_start = dist.const_data_ptr(); + const scalar_t * const t1_start = t1.const_data_ptr(); + const scalar_t * const t2_start = t2.const_data_ptr(); scalar_t * const res_start = result.data_ptr(); at::parallel_for(0, m / Vec::size(), internal::GRAIN_SIZE / (16 * r1), [=](int64_t l, int64_t end) { diff --git a/aten/src/ATen/native/cpu/DistributionTemplates.h b/aten/src/ATen/native/cpu/DistributionTemplates.h index 1a1039b916f8e..93a9b33b29285 100644 --- a/aten/src/ATen/native/cpu/DistributionTemplates.h +++ b/aten/src/ATen/native/cpu/DistributionTemplates.h @@ -15,7 +15,6 @@ #include #endif - namespace at { namespace native { namespace templates { @@ -149,6 +148,62 @@ static void normal_fill_16(scalar_t *data, const scalar_t mean, const scalar_t s } } +#if defined(__VSX__) || defined(CPU_CAPABILITY_VSX) +static void normal_fill_16_VSX(float *data,const Vectorized &two_pi,const Vectorized &one,const Vectorized &minus_two,const Vectorized &mean,const Vectorized &std) { + using Vec = Vectorized; + Vec u1=one-Vec::loadu(data); + Vec u2=Vec::loadu(data+8); + Vec radius=(minus_two * u1.log()); + radius=radius.sqrt(); + Vec theta=two_pi * u2; + Vec output_vec=radius * theta.cos() * std + mean; + Vec output_vec2=radius * theta.sin() * std + mean; + output_vec.store(data); + output_vec2.store(data+8); +} + +template +void normal_fill_VSX(const TensorBase &self, const scalar_t mean, const scalar_t std, RNG generator) { + float *data = self.data_ptr(); + auto size = self.numel(); + std::lock_guard lock(generator->mutex_); + for (const auto i : c10::irange(size)) { + at::uniform_real_distribution uniform(0, 1); + data[i] = uniform(generator); + } + + using Vec = Vectorized; + const Vec two_pi = Vec(2.0f * c10::pi); + const Vec one = Vec(1.0f); + const Vec minus_two = Vec(-2.0f); + const Vec var_vec = Vec(std); + const Vec mean_vec = Vec(mean); + + for (int64_t i = 0; i < size - 15; i += 16) { + if(Vec::size()==8) { + normal_fill_16_VSX(data + i, two_pi, one, minus_two, mean_vec, var_vec); + } + else{ + normal_fill_16(data + i, mean, std); + } + } + if (size % 16 != 0) { + // Recompute the last 16 values. 
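[Editor's note] The new VSX path above first fills the buffer with uniform samples and then applies the Box-Muller transform 16 floats at a time in normal_fill_16_VSX. The scalar math behind one vector lane pair is sketched below; box_muller_pair is an illustrative name only, and this is the per-pair model, not the vectorized code.

    #include <cmath>

    // Two uniforms in [0, 1) become two independent normal samples.
    void box_muller_pair(float u1, float u2, float mean, float stddev,
                         float& out1, float& out2) {
      constexpr float two_pi = 6.2831853071795864769f;
      float r = std::sqrt(-2.0f * std::log(1.0f - u1));  // 1 - u1 keeps the log argument in (0, 1]
      float theta = two_pi * u2;
      out1 = r * std::cos(theta) * stddev + mean;
      out2 = r * std::sin(theta) * stddev + mean;
    }
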
+ data = data + size - 16; + for (const auto i : c10::irange(16)) { + at::uniform_real_distribution uniform(0, 1); + data[i] = uniform(generator); + } + if(Vec::size()==8){ + normal_fill_16_VSX(data, two_pi, one, minus_two, mean_vec, var_vec); + } + else{ + normal_fill_16(data, mean, std); + } + } +} +#endif //VSX + template void normal_fill(const TensorBase &self, const scalar_t mean, const scalar_t std, RNG generator) { scalar_t *data = self.data_ptr(); @@ -179,6 +234,8 @@ void normal_kernel(const TensorBase &self, double mean, double std, RNG generato if (self.scalar_type() == ScalarType::Float && size >= 16 && self.is_contiguous()) { #ifdef CPU_CAPABILITY_AVX2 normal_fill_AVX2(self, static_cast(mean), static_cast(std), generator); +#elif defined(__VSX__) || defined(CPU_CAPABILITY_VSX) + normal_fill_VSX(self, static_cast(mean), static_cast(std), generator); #else normal_fill(self, static_cast(mean), static_cast(std), generator); #endif @@ -321,7 +378,7 @@ void bernoulli_kernel(const TensorBase &self, const TensorBase &p_, RNG generato auto p = expand_inplace(self, p_cpu); auto iter = TensorIteratorConfig() .add_output(self) - .add_input(*p) + .add_const_input(*p) .check_all_same_dtype(false) .build(); if (p->scalar_type() == kDouble) { diff --git a/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp b/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp index e58ebd17c255a..cb96f24ebdde6 100644 --- a/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp +++ b/aten/src/ATen/native/cpu/FlashAttentionKernel.cpp @@ -23,25 +23,25 @@ namespace { // 1) out = exp(a - val) // 2) val = sum(out) -template +template inline void _exp_reduce_sum_fusion_kernel( - scalar_t* a, + T1* a, const int& size, - scalar_t* out, - scalar_t& val) { - auto vec_size = vec::Vectorized::size(); - auto vec_max = vec::Vectorized(val); - scalar_t tmp_sum = 0; - auto vec_tmp_sum = vec::Vectorized(tmp_sum); + T2* out, + T1& val) { + auto vec_size = vec::Vectorized::size(); + auto vec_max = vec::Vectorized(val); + T1 tmp_sum = 0; + auto vec_tmp_sum = vec::Vectorized(tmp_sum); for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) { - auto tmp0 = vec::Vectorized::loadu(a + i); + auto tmp0 = vec::Vectorized::loadu(a + i); auto tmp1 = tmp0 - vec_max; auto tmp2 = tmp1.exp_u20(); vec_tmp_sum += tmp2; _store(out + i, tmp2); } - tmp_sum = vec::vec_reduce_all( - [](vec::Vectorized& x, vec::Vectorized& y) { + tmp_sum = vec::vec_reduce_all( + [](vec::Vectorized& x, vec::Vectorized& y) { return x + y; }, vec_tmp_sum); @@ -55,27 +55,6 @@ inline void _exp_reduce_sum_fusion_kernel( val = tmp_sum; } -// out = a / sum -template -inline void _normalization_kernel( - const T1* a, - const T1& sum, - const int& size, - T2* out) { - auto vec_size = vec::Vectorized::size(); - auto vec_sum = vec::Vectorized(sum); - for (long i = 0; i < vec_size * (size / vec_size); i += vec_size) { - auto tmp0 = vec::Vectorized::loadu(a + i); - auto tmp1 = tmp0 / vec_sum; - _store(out + i, tmp1); - } - for (long i = vec_size * (size / vec_size); i < size; i++) { - auto tmp0 = a[i]; - auto tmp1 = tmp0 / sum; - out[i] = tmp1; - } -} - // 1) out = a * scale // 2) max = max(out) template @@ -112,7 +91,7 @@ inline void _mul_reduce_max_fusion_kernel( template static inline scalar_t* conditional_data_ptr(scalar_t* ptr, scalar_t* ptr2) { - TORCH_INTERNAL_ASSERT(ptr2 == nullptr); + TORCH_CHECK(ptr2 == nullptr); return ptr; } @@ -253,11 +232,11 @@ void cpu_flash_attention( at::Tensor buf_reduced = at::empty({num_thread, qSplitSize, is_reduced_type ? 
kvSplitSize : 0}, query.options()); // Data ptrs - scalar_t* q_data = query.data_ptr(); - scalar_t* k_data = key.data_ptr(); - scalar_t* v_data = value.data_ptr(); - accum_t* mask_data = has_attn_mask - ? attn_mask.value().data_ptr() + const scalar_t* q_data = query.const_data_ptr(); + const scalar_t* k_data = key.const_data_ptr(); + const scalar_t* v_data = value.const_data_ptr(); + const accum_t* mask_data = has_attn_mask + ? attn_mask.value().const_data_ptr() : nullptr; scalar_t* out_data = output.data_ptr(); accum_t* lse_data = logsumexp.data_ptr(); @@ -331,9 +310,8 @@ void cpu_flash_attention( } } // Update coefficients with Softmax - accum_t tmp_max = 0, tmp_sum = 0, sum_old = 0, exp_tmp = 0; + accum_t tmp_max = 0, tmp_sum = 0, exp_tmp = 0; for (int64_t row = 0; row < qBlockSize; ++row) { - sum_old = qk_sum_data[row]; if (has_attn_mask) { // max per row tmp_max = at::vec::reduce_all( @@ -352,23 +330,20 @@ void cpu_flash_attention( tmp_max = qk_max_data[row] > tmp_max ? qk_max_data[row] : tmp_max; // qk <- exp(qk - max) and sum per row tmp_sum = tmp_max; - _exp_reduce_sum_fusion_kernel(qk_data + row * kvBlockSize, kvBlockSize, qk_data + row * kvBlockSize, tmp_sum); + _exp_reduce_sum_fusion_kernel( + qk_data + row * kvBlockSize, kvBlockSize, + conditional_data_ptr(qk_data, qk_reduced_data) + row * kvBlockSize, + tmp_sum); // exp_tmp <- exp(max[row] - max) exp_tmp = std::exp(qk_max_data[row] - tmp_max); // sum[row] <- sum + exp_tmp * sum[row] qk_sum_data[row] = tmp_sum + exp_tmp * qk_sum_data[row]; // max[row] <- max qk_max_data[row] = tmp_max; - // qk <- qk / sum[row] - accum_t sum_new = qk_sum_data[row]; - _normalization_kernel(qk_data + row * kvBlockSize, sum_new, kvBlockSize, - conditional_data_ptr(qk_data, qk_reduced_data) + row * kvBlockSize); - // dst <- dst * sum_old / sum_new * exp_tmp + // dst <- dst * exp_tmp if (n > 0) { - accum_t sum_cor = sum_old / sum_new; vec::map( - [sum_cor, exp_tmp](Vec x) - { return x * Vec(sum_cor) * Vec(exp_tmp); }, + [exp_tmp](Vec x) { return x * Vec(exp_tmp); }, dst_data + row * headSize, dst_data + row * headSize, headSize); } } @@ -389,10 +364,12 @@ void cpu_flash_attention( dst_data, headSize); } + // dst <- dst / sum[row] // reorder MHA output with strides for (int64_t row = 0; row < qBlockSize; ++row) { + accum_t sum_reciprocal = 1 / qk_sum_data[row]; vec::map( - [](Vec x) { return x; }, + [sum_reciprocal](Vec x) { return x * Vec(sum_reciprocal); }, out_data + i * oStrideB + j * oStrideH + m * oStrideM + row * oStrideM, dst_data + row * headSize, headSize); @@ -516,15 +493,15 @@ void cpu_flash_attention_backward( scalar_t* grad_q_data = grad_q.data_ptr(); scalar_t* grad_k_data = grad_k.data_ptr(); scalar_t* grad_v_data = grad_v.data_ptr(); - scalar_t* grad_out_data = grad_out.data_ptr(); - scalar_t* q_data = query.data_ptr(); - scalar_t* k_data = key.data_ptr(); - scalar_t* v_data = value.data_ptr(); - accum_t* mask_data = has_attn_mask - ? attn_mask.value().data_ptr() + const scalar_t* grad_out_data = grad_out.const_data_ptr(); + const scalar_t* q_data = query.const_data_ptr(); + const scalar_t* k_data = key.const_data_ptr(); + const scalar_t* v_data = value.const_data_ptr(); + const accum_t* mask_data = has_attn_mask + ? attn_mask.value().const_data_ptr() : nullptr; - scalar_t* out_data = out.data_ptr(); - accum_t* lse_data = logsumexp.data_ptr(); + const scalar_t* out_data = out.const_data_ptr(); + const accum_t* lse_data = logsumexp.const_data_ptr(); accum_t* buf_data = buf.data_ptr(); scalar_t* buf_reduced_data = is_reduced_type ? 
buf_reduced.data_ptr() : nullptr; @@ -721,7 +698,7 @@ void flash_attention_kernel_impl( c10::optional scale) { auto q_seq_len = query.size(2); - AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, query.scalar_type(), "flash_attention", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, query.scalar_type(), "flash_attention", [&] { if (q_seq_len >= 768) { cpu_flash_attention( output, logsumexp, query, key, value, @@ -758,7 +735,7 @@ void flash_attention_backward_kernel_impl( auto grad_out_contig = grad_out.contiguous(); auto q_seq_len = query.size(1); - AT_DISPATCH_FLOATING_TYPES_AND(kBFloat16, query.scalar_type(), "flash_attention_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, query.scalar_type(), "flash_attention_backward", [&] { if (q_seq_len >= 768) { cpu_flash_attention_backward( grad_q, grad_k, grad_v, grad_out_contig, diff --git a/aten/src/ATen/native/cpu/FusedAdamKernel.cpp b/aten/src/ATen/native/cpu/FusedAdamKernel.cpp new file mode 100644 index 0000000000000..4a10fe202c4a0 --- /dev/null +++ b/aten/src/ATen/native/cpu/FusedAdamKernel.cpp @@ -0,0 +1,379 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +namespace at::native { + +namespace{ + +template +typename std::enable_if< + std::is_same::value || std::is_same::value, + void>:: + type inline adam_math( + scalar_t* param_ptr, + scalar_t* exp_avg_ptr, + scalar_t* exp_avg_sq_ptr, + scalar_t* grad_ptr, + scalar_t* max_exp_avg_sq_ptr, + double lr, + double bias_correction1, + double bias_correction2, + double exp_avg_grad_coefficient, + double exp_avg_sq_grad_coefficient, + double bias_correction2_sqrt, + double eps, + double weight_decay, + double beta2, + bool amsgrad, + bool maximize, + const float* grad_scale_ptr, + int64_t size +){ + double step_size = lr / bias_correction1; + using lpVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + lpVec grad_vec_to_store; + int64_t d = 0; + fVec param_vec1, param_vec2; + fVec grad_vec1, grad_vec2; + fVec exp_avg_vec1, exp_avg_vec2; + fVec exp_avg_sq_vec1, exp_avg_sq_vec2; + fVec max_exp_avg_sq_vec1, max_exp_avg_sq_vec2; + for (; d < size - (size % lpVec::size()); d += lpVec::size()) { + lpVec param_lpvec = lpVec::loadu(param_ptr + d); + std::tie(param_vec1, param_vec2) = vec::convert_to_float(param_lpvec); + lpVec grad_lpvec = lpVec::loadu(grad_ptr + d); + std::tie(grad_vec1, grad_vec2) = vec::convert_to_float(grad_lpvec); + if (grad_scale_ptr) { + grad_vec1 = grad_vec1 / fVec(float(*grad_scale_ptr)); + grad_vec2 = grad_vec2 / fVec(float(*grad_scale_ptr)); + grad_vec_to_store = vec::convert_from_float(grad_vec1, grad_vec2); + grad_vec_to_store.store(grad_ptr + d); + } + if (maximize){ + grad_vec1 = grad_vec1 * fVec(opmath_t(-1.0)); + grad_vec2 = grad_vec2 * fVec(opmath_t(-1.0)); + } + if (weight_decay != 0.f){ + if constexpr (adam_mode == ADAM_MODE::ORIGINAL) { + grad_vec1 += param_vec1 * fVec(opmath_t(weight_decay)); + grad_vec2 += param_vec2 * fVec(opmath_t(weight_decay)); + } else if constexpr (adam_mode == ADAM_MODE::ADAMW) { + param_vec1 = param_vec1 * fVec(opmath_t(1 - lr * weight_decay)); + param_vec2 = param_vec2 * fVec(opmath_t(1 - lr * weight_decay)); + } + } + + lpVec exp_avg_lpvec = lpVec::loadu(exp_avg_ptr + d); + std::tie(exp_avg_vec1, exp_avg_vec2) = vec::convert_to_float(exp_avg_lpvec); + + // exp_avg.lerp_(grad, 1 - beta1) + const fVec lerp_weight = fVec(opmath_t(exp_avg_grad_coefficient)); + auto mask = lerp_weight.abs() < fVec(0.5); + auto coeff = 
fVec::blendv(lerp_weight - fVec(1), lerp_weight, mask); + + auto base1 = fVec::blendv(grad_vec1, exp_avg_vec1, mask); + exp_avg_vec1 = vec::fmadd(coeff, grad_vec1 - exp_avg_vec1, base1); + + auto base2 = fVec::blendv(grad_vec2, exp_avg_vec2, mask); + exp_avg_vec2 = vec::fmadd(coeff, grad_vec2 - exp_avg_vec2, base2); + + lpVec exp_avg_sq_lpvec = lpVec::loadu(exp_avg_sq_ptr + d); + std::tie(exp_avg_sq_vec1, exp_avg_sq_vec2) = vec::convert_to_float(exp_avg_sq_lpvec); + exp_avg_sq_vec1 = exp_avg_sq_vec1 * fVec(opmath_t(beta2)) + + fVec(opmath_t(exp_avg_sq_grad_coefficient)) * grad_vec1 * grad_vec1; + exp_avg_sq_vec2 = exp_avg_sq_vec2 * fVec(opmath_t(beta2)) + + fVec(opmath_t(exp_avg_sq_grad_coefficient)) * grad_vec2 * grad_vec2; + + vec::convert_from_float(exp_avg_vec1, exp_avg_vec2).store(exp_avg_ptr + d); + vec::convert_from_float(exp_avg_sq_vec1, exp_avg_sq_vec2).store(exp_avg_sq_ptr + d); + + fVec denom_vec1, denom_vec2; + if (amsgrad) { + lpVec max_exp_avg_sq_lpvec = lpVec::loadu(max_exp_avg_sq_ptr + d); + std::tie(max_exp_avg_sq_vec1, max_exp_avg_sq_vec2) = vec::convert_to_float(max_exp_avg_sq_lpvec); + max_exp_avg_sq_vec1 = maximum(max_exp_avg_sq_vec1, exp_avg_sq_vec1); + max_exp_avg_sq_vec2 = maximum(max_exp_avg_sq_vec2, exp_avg_sq_vec2); + vec::convert_from_float(max_exp_avg_sq_vec1, max_exp_avg_sq_vec2).store(max_exp_avg_sq_ptr + d); + denom_vec1 = + (max_exp_avg_sq_vec1.sqrt() / fVec(opmath_t(bias_correction2_sqrt))) + fVec(opmath_t(eps)); + denom_vec2 = + (max_exp_avg_sq_vec2.sqrt() / fVec(opmath_t(bias_correction2_sqrt))) + fVec(opmath_t(eps)); + } else { + denom_vec1 = + (exp_avg_sq_vec1.sqrt() / fVec(opmath_t(bias_correction2_sqrt))) + fVec(opmath_t(eps)); + denom_vec2 = + (exp_avg_sq_vec2.sqrt() / fVec(opmath_t(bias_correction2_sqrt))) + fVec(opmath_t(eps)); + } + param_vec1 = param_vec1 + fVec(opmath_t(-step_size)) * exp_avg_vec1 / denom_vec1; + param_vec2 = param_vec2 + fVec(opmath_t(-step_size)) * exp_avg_vec2 / denom_vec2; + vec::convert_from_float(param_vec1, param_vec2).store(param_ptr + d); + } + scalar_t grad_val_to_store; + for (; d < size; d++) { + opmath_t grad_val = grad_ptr[d]; + opmath_t param_val = param_ptr[d]; + if (grad_scale_ptr) { + grad_val = grad_ptr[d] / float(*grad_scale_ptr); + grad_val_to_store = scalar_t(grad_val); + grad_ptr[d] = grad_val_to_store; + } + if (maximize) grad_val = -grad_val; + if (weight_decay != 0.f){ + if constexpr (adam_mode == ADAM_MODE::ORIGINAL) { + grad_val += param_val * opmath_t(weight_decay); + } else if constexpr (adam_mode == ADAM_MODE::ADAMW) { + param_val = param_val * opmath_t(1 - lr * weight_decay); + } + } + // exp_avg.lerp_(grad, 1 - beta1) + opmath_t exp_avg_var = exp_avg_ptr[d]; + auto is_lerp_weight_small = std::abs(opmath_t(exp_avg_grad_coefficient)) < opmath_t(0.5); + if (is_lerp_weight_small) { + exp_avg_var = exp_avg_var + opmath_t(exp_avg_grad_coefficient) * (grad_val - exp_avg_var); + } else { + exp_avg_var = grad_val - (grad_val - exp_avg_var) * (opmath_t(1) - opmath_t(exp_avg_grad_coefficient)); + } + exp_avg_ptr[d] = scalar_t(exp_avg_var); + opmath_t exp_avg_sq_var = exp_avg_sq_ptr[d]; + exp_avg_sq_var = exp_avg_sq_var * opmath_t(beta2); + exp_avg_sq_var = exp_avg_sq_var + + opmath_t(exp_avg_sq_grad_coefficient) * grad_val * grad_val; + exp_avg_sq_ptr[d] = scalar_t(exp_avg_sq_var); + opmath_t demon_val; + if (amsgrad) { + opmath_t max_exp_avg_sq_var = max_exp_avg_sq_ptr[d]; + max_exp_avg_sq_var = std::max(max_exp_avg_sq_var, exp_avg_sq_var); + max_exp_avg_sq_ptr[d] = + scalar_t(max_exp_avg_sq_var); + 
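[Editor's note] For reference, the per-element math that both the vectorized and scalar paths of adam_math above implement, written out as a plain scalar sketch (ORIGINAL mode only; maximize, amsgrad and grad scaling omitted; all names here are illustrative, not part of the patch).

    #include <cmath>
    #include <cstdint>

    void adam_step_ref(float& param, float& exp_avg, float& exp_avg_sq, float grad,
                       double lr, double beta1, double beta2, double eps,
                       double weight_decay, int64_t step) {
      double bias_correction1 = 1.0 - std::pow(beta1, step);
      double bias_correction2_sqrt = std::sqrt(1.0 - std::pow(beta2, step));
      double step_size = lr / bias_correction1;

      double g = grad + weight_decay * param;                    // ADAM_MODE::ORIGINAL weight decay
      exp_avg = float(exp_avg + (1.0 - beta1) * (g - exp_avg));  // exp_avg.lerp_(grad, 1 - beta1)
      exp_avg_sq = float(beta2 * exp_avg_sq + (1.0 - beta2) * g * g);

      double denom = std::sqrt(double(exp_avg_sq)) / bias_correction2_sqrt + eps;
      param = float(param - step_size * exp_avg / denom);
    }
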
demon_val = + std::sqrt(max_exp_avg_sq_var) / opmath_t(bias_correction2_sqrt) + opmath_t(eps); + } else { + demon_val = std::sqrt(exp_avg_sq_var) / opmath_t(bias_correction2_sqrt) + opmath_t(eps); + } + param_ptr[d] = param_val - opmath_t(step_size) * exp_avg_var / demon_val; + } +} + + +template +typename std::enable_if< + std::is_same::value || std::is_same::value, + void>:: + type inline adam_math( + scalar_t* param_ptr, + scalar_t* exp_avg_ptr, + scalar_t* exp_avg_sq_ptr, + scalar_t* grad_ptr, + scalar_t* max_exp_avg_sq_ptr, + double lr, + double bias_correction1, + double bias_correction2, + double exp_avg_grad_coefficient, + double exp_avg_sq_grad_coefficient, + double bias_correction2_sqrt, + double eps, + double weight_decay, + double beta2, + bool amsgrad, + bool maximize, + const float* grad_scale_ptr, + int64_t size +){ + double step_size = lr / bias_correction1; + using Vec = at::vec::Vectorized; + Vec grad_vec_to_store; + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec param_vec = Vec::loadu(param_ptr + d); + Vec grad_vec = Vec::loadu(grad_ptr + d); + if (grad_scale_ptr) { + grad_vec = grad_vec / Vec(scalar_t(*grad_scale_ptr)); + grad_vec_to_store = grad_vec; + grad_vec_to_store.store(grad_ptr + d); + } + if (maximize) grad_vec = grad_vec * Vec(scalar_t(-1.0)); + if (weight_decay != 0.f){ + if constexpr (adam_mode == ADAM_MODE::ORIGINAL) { + grad_vec += param_vec * Vec(scalar_t(weight_decay)); + } else if constexpr (adam_mode == ADAM_MODE::ADAMW) { + param_vec = param_vec * Vec(scalar_t(1 - lr * weight_decay)); + } + } + Vec exp_avg_vec = Vec::loadu(exp_avg_ptr + d); + // exp_avg.lerp_(grad, 1 - beta1) + const Vec lerp_weight = Vec(scalar_t(exp_avg_grad_coefficient)); + auto mask = lerp_weight.abs() < Vec(0.5); + auto coeff = Vec::blendv(lerp_weight - Vec(1), lerp_weight, mask); + auto base = Vec::blendv(grad_vec, exp_avg_vec, mask); + exp_avg_vec = vec::fmadd(coeff, grad_vec - exp_avg_vec, base); + + Vec exp_avg_sq_vec = Vec::loadu(exp_avg_sq_ptr + d) * Vec(scalar_t(beta2)) + + Vec(scalar_t(exp_avg_sq_grad_coefficient)) * grad_vec * grad_vec; + exp_avg_vec.store(exp_avg_ptr + d); + exp_avg_sq_vec.store(exp_avg_sq_ptr + d); + + Vec denom_vec; + if (amsgrad) { + Vec max_exp_avg_sq_vec = + maximum(Vec::loadu(max_exp_avg_sq_ptr + d), exp_avg_sq_vec); + max_exp_avg_sq_vec.store(max_exp_avg_sq_ptr + d); + denom_vec = + (max_exp_avg_sq_vec.sqrt() / Vec(scalar_t(bias_correction2_sqrt))) + Vec(scalar_t(eps)); + } else { + denom_vec = + (exp_avg_sq_vec.sqrt() / Vec(scalar_t(bias_correction2_sqrt))) + Vec(scalar_t(eps)); + } + param_vec = param_vec + Vec(scalar_t(-step_size)) * exp_avg_vec / denom_vec; + param_vec.store(param_ptr + d); + } + scalar_t grad_val_to_store; + for (; d < size; d++) { + scalar_t grad_val = grad_ptr[d]; + if (grad_scale_ptr) { + grad_val = grad_ptr[d] / scalar_t(*grad_scale_ptr); + grad_val_to_store = grad_val; + grad_ptr[d] = grad_val_to_store; + } + if (maximize) grad_val = -grad_val; + if (weight_decay != 0.f){ + if constexpr (adam_mode == ADAM_MODE::ORIGINAL) { + grad_val += param_ptr[d] * scalar_t(weight_decay); + } else if constexpr (adam_mode == ADAM_MODE::ADAMW) { + param_ptr[d] = param_ptr[d] * scalar_t(1 - lr * weight_decay); + } + } + // exp_avg.lerp_(grad, 1 - beta1) + auto is_lerp_weight_small = std::abs(scalar_t(exp_avg_grad_coefficient)) < scalar_t(0.5); + if (is_lerp_weight_small) { + exp_avg_ptr[d] = exp_avg_ptr[d] + scalar_t(exp_avg_grad_coefficient) * (grad_val - exp_avg_ptr[d]); + } else { + exp_avg_ptr[d] 
= grad_val - (grad_val - exp_avg_ptr[d]) * (scalar_t(1) - scalar_t(exp_avg_grad_coefficient)); + } + exp_avg_sq_ptr[d] = exp_avg_sq_ptr[d] * scalar_t(beta2); + exp_avg_sq_ptr[d] = exp_avg_sq_ptr[d] + + scalar_t(exp_avg_sq_grad_coefficient) * grad_val * grad_val; + scalar_t demon_val; + if (amsgrad) { + max_exp_avg_sq_ptr[d] = + std::max(max_exp_avg_sq_ptr[d], exp_avg_sq_ptr[d]); + demon_val = + std::sqrt(max_exp_avg_sq_ptr[d]) / scalar_t(bias_correction2_sqrt) + scalar_t(eps); + } else { + demon_val = std::sqrt(exp_avg_sq_ptr[d]) / scalar_t(bias_correction2_sqrt) + scalar_t(eps); + } + param_ptr[d] = param_ptr[d] - scalar_t(step_size) * exp_avg_ptr[d] / demon_val; + } +} + + +template +void adam_fused_step_impl( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& exp_avg, + const at::Tensor& exp_avg_sq, + const at::Tensor& max_exp_avg_sq, + const at::Tensor& state_step, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool amsgrad, + const bool maximize, + const float* grad_scale_ptr) { + using opmath_t = at::opmath_type; + double step = state_step.item(); + scalar_t* param_data = param.data_ptr(); + scalar_t* exp_avg_data = exp_avg.data_ptr(); + scalar_t* exp_avg_sq_data = exp_avg_sq.data_ptr(); + scalar_t* max_exp_avg_sq_data = amsgrad ? max_exp_avg_sq.data_ptr() : nullptr; + scalar_t* grad_data = grad.data_ptr(); + + // need to use double here to align with non-fused adam + double bias_correction1 = 1 - std::pow(beta1, step); + double bias_correction2 = 1 - std::pow(beta2, step); + double exp_avg_grad_coefficient = 1 - beta1; + double exp_avg_sq_grad_coefficient = 1 - beta2; + double bias_correction2_sqrt = std::sqrt(bias_correction2); + + + constexpr size_t cache_line_size = 64; + constexpr int64_t cache_line_aligned_task_unit = cache_line_size / sizeof(scalar_t); + size_t num_units = divup(param.numel(), cache_line_aligned_task_unit); + + auto adam_fn = [&](int64_t begin, int64_t end) { + // local pointers + begin *= cache_line_aligned_task_unit; + end = std::min(end * cache_line_aligned_task_unit, param.numel()); + scalar_t* param_ptr = param_data + begin; + scalar_t* exp_avg_ptr = exp_avg_data + begin; + scalar_t* exp_avg_sq_ptr = exp_avg_sq_data + begin; + scalar_t* grad_ptr = grad_data + begin; + scalar_t* max_exp_avg_sq_ptr = amsgrad ? 
max_exp_avg_sq_data + begin : nullptr; + + const int64_t size = end - begin; + adam_math( + param_ptr, + exp_avg_ptr, + exp_avg_sq_ptr, + grad_ptr, + max_exp_avg_sq_ptr, + lr, + bias_correction1, + bias_correction2, + exp_avg_grad_coefficient, + exp_avg_sq_grad_coefficient, + bias_correction2_sqrt, + eps, + weight_decay, + beta2, + amsgrad, + maximize, + grad_scale_ptr, + size + ); + }; + at::parallel_for( + 0, num_units, 0, adam_fn); +} + +void fused_adam_kernel( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& exp_avg, + const at::Tensor& exp_avg_sq, + const at::Tensor& max_exp_avg_sq, + const at::Tensor& state_step, + const double lr, + const double beta1, + const double beta2, + const double weight_decay, + const double eps, + const bool amsgrad, + const bool maximize, + const float* grad_scale_ptr, + const ADAM_MODE adam_mode + ) { + Tensor grad_contiguous = grad.contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, param.scalar_type(), "fused_adam_kernel", [&] { + if(adam_mode == ADAM_MODE::ORIGINAL){ + adam_fused_step_impl(param, grad, exp_avg, exp_avg_sq, max_exp_avg_sq, state_step, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale_ptr); + } else { + adam_fused_step_impl(param, grad, exp_avg, exp_avg_sq, max_exp_avg_sq, state_step, lr, beta1, beta2, weight_decay, eps, amsgrad, maximize, grad_scale_ptr); + } + + }); +} + +} + +REGISTER_DISPATCH(fused_adam_stub, &fused_adam_kernel); +} // namespace at::native diff --git a/aten/src/ATen/native/cpu/FusedSGDKernel.cpp b/aten/src/ATen/native/cpu/FusedSGDKernel.cpp new file mode 100644 index 0000000000000..c19aa249a1594 --- /dev/null +++ b/aten/src/ATen/native/cpu/FusedSGDKernel.cpp @@ -0,0 +1,278 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include +#include +#include +#include +#include +#include +#include +#include +namespace at::native { + +namespace{ + +template +typename std::enable_if< + std::is_same::value || std::is_same::value, + void>:: + type inline sgd_math( + scalar_t* param_ptr, + scalar_t* grad_ptr, + scalar_t* momentum_buf_ptr, + const double weight_decay, + const double momentum, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const float* grad_scale_ptr, + int64_t size +){ + using lpVec = at::vec::Vectorized; + using fVec = at::vec::Vectorized; + lpVec grad_vec_to_store; + fVec param_vec1, param_vec2; + fVec grad_vec1, grad_vec2; + fVec momentum_buffer_vec1, momentum_buffer_vec2; + int64_t d = 0; + for (; d < size - (size % lpVec::size()); d += lpVec::size()) { + lpVec param_lpvec = lpVec::loadu(param_ptr + d); + std::tie(param_vec1, param_vec2) = vec::convert_to_float(param_lpvec); + lpVec grad_lpvec = lpVec::loadu(grad_ptr + d); + std::tie(grad_vec1, grad_vec2) = vec::convert_to_float(grad_lpvec); + if (grad_scale_ptr) { + grad_vec1 = grad_vec1 / fVec(float(*grad_scale_ptr)); + grad_vec2 = grad_vec2 / fVec(float(*grad_scale_ptr)); + grad_vec_to_store = vec::convert_from_float(grad_vec1, grad_vec2); + grad_vec_to_store.store(grad_ptr + d); + } + if (maximize){ + grad_vec1 = grad_vec1 * fVec(opmath_t(-1.0)); + grad_vec2 = grad_vec2 * fVec(opmath_t(-1.0)); + } + if (weight_decay != 0.0){ + grad_vec1 = vec::fmadd(param_vec1, fVec(scalar_t(weight_decay)), grad_vec1); + grad_vec2 = vec::fmadd(param_vec2, fVec(scalar_t(weight_decay)), grad_vec2); + } + if (momentum != 0.0) { + fVec momentum_vec1, momentum_vec2; + if (is_first_step) { + momentum_vec1 = grad_vec1; + momentum_vec2 = grad_vec2; + } 
else { + + momentum_vec1 = fVec::loadu(momentum_buf_ptr + d) * fVec(scalar_t(momentum)); + momentum_vec2 = fVec::loadu(momentum_buf_ptr + d + fVec::size()) * fVec(scalar_t(momentum)); + momentum_vec1 = vec::fmadd(fVec(scalar_t(1 - dampening)), grad_vec1, momentum_vec1); + momentum_vec2 = vec::fmadd(fVec(scalar_t(1 - dampening)), grad_vec2, momentum_vec2); + } + vec::convert_from_float(momentum_vec1, momentum_vec2).store(momentum_buf_ptr + d);; + if (nesterov) { + grad_vec1 = vec::fmadd(momentum_vec1, fVec(scalar_t(momentum)), grad_vec1); + grad_vec2 = vec::fmadd(momentum_vec2, fVec(scalar_t(momentum)), grad_vec2); + } else { + grad_vec1 = momentum_vec1; + grad_vec2 = momentum_vec2; + } + } + } + scalar_t grad_val_to_store; + for (; d < size; d++) { + opmath_t grad_val = grad_ptr[d]; + opmath_t param_val = param_ptr[d]; + if (grad_scale_ptr) { + grad_val = grad_ptr[d] / opmath_t(*grad_scale_ptr); + grad_val_to_store = grad_val; + grad_ptr[d] = grad_val_to_store; + } + if (maximize) grad_val = -grad_val; + if (weight_decay != 0.0){ + grad_val += param_val * opmath_t(weight_decay); + } + if (momentum != 0.0) { + opmath_t momentum_buf_var = momentum_buf_ptr[d]; + if (is_first_step) { + momentum_buf_var = grad_val; + } else { + momentum_buf_var = momentum_buf_var * opmath_t(momentum) + + grad_val * opmath_t(1 - dampening); + } + momentum_buf_ptr[d] = momentum_buf_var; + if (nesterov) { + grad_val += momentum_buf_var * opmath_t(momentum); + } else { + grad_val = momentum_buf_var; + } + } + param_ptr[d] = param_val - grad_val * opmath_t(lr); + } +} + + +template +typename std::enable_if< + std::is_same::value || std::is_same::value, + void>:: + type inline sgd_math( + scalar_t* param_ptr, + scalar_t* grad_ptr, + scalar_t* momentum_buf_ptr, + const double weight_decay, + const double momentum, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const float* grad_scale_ptr, + int64_t size +){ + using Vec = at::vec::Vectorized; + Vec grad_vec_to_store; + int64_t d = 0; + for (; d < size - (size % Vec::size()); d += Vec::size()) { + Vec param_vec = Vec::loadu(param_ptr + d); + Vec grad_vec = Vec::loadu(grad_ptr + d); + if (grad_scale_ptr) { + grad_vec = grad_vec / Vec(scalar_t(*grad_scale_ptr)); + grad_vec_to_store = grad_vec; + grad_vec_to_store.store(grad_ptr + d); + } + if (maximize) grad_vec = grad_vec * Vec(scalar_t(-1.0)); + if (weight_decay != 0.0){ + grad_vec = vec::fmadd(param_vec, Vec(scalar_t(weight_decay)), grad_vec); + } + if (momentum != 0.0) { + Vec momentum_vec; + if (is_first_step) { + momentum_vec = grad_vec; + } else { + momentum_vec = + Vec::loadu(momentum_buf_ptr + d) * Vec(scalar_t(momentum)); + momentum_vec = vec::fmadd(Vec(scalar_t(1 - dampening)), grad_vec, momentum_vec); + } + momentum_vec.store(momentum_buf_ptr + d); + if (nesterov) { + grad_vec = vec::fmadd(momentum_vec, Vec(scalar_t(momentum)), grad_vec); + } else { + grad_vec = momentum_vec; + } + } + param_vec += grad_vec * Vec(scalar_t(-lr)); + param_vec.store(param_ptr + d); + } + scalar_t grad_val_to_store; + for (; d < size; d++) { + scalar_t grad_val = grad_ptr[d]; + if (grad_scale_ptr) { + grad_val = grad_ptr[d] / scalar_t(*grad_scale_ptr); + grad_val_to_store = grad_val; + grad_ptr[d] = grad_val_to_store; + } + if (maximize) grad_val = -grad_val; + if (weight_decay != 0.0){ + grad_val += param_ptr[d] * scalar_t(weight_decay); + } + if (momentum != 0.0) { + if (is_first_step) { + momentum_buf_ptr[d] = grad_val; + } else { + momentum_buf_ptr[d] = 
momentum_buf_ptr[d] * scalar_t(momentum) + + grad_val * scalar_t(1 - dampening); + } + if (nesterov) { + grad_val += momentum_buf_ptr[d] * scalar_t(momentum); + } else { + grad_val = momentum_buf_ptr[d]; + } + } + param_ptr[d] -= grad_val * scalar_t(lr); + } +} + +template +void sgd_fused_step_impl( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& momentum_buffer, + const double weight_decay, + const double momentum, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const float* grad_scale_ptr) { + using opmath_t = at::opmath_type; + scalar_t* param_data = param.data_ptr(); + scalar_t* grad_data = grad.data_ptr(); + bool has_momentum_buffer = momentum != 0.0; + scalar_t* momentum_buffer_data = has_momentum_buffer ? momentum_buffer.data_ptr() : nullptr; + + constexpr size_t cache_line_size = 64; + constexpr int64_t cache_line_aligned_task_unit = cache_line_size / sizeof(scalar_t); + size_t num_units = divup(param.numel(), cache_line_aligned_task_unit); + + auto sgd_fn = [&](int64_t begin, int64_t end) { + // local pointers + begin *= cache_line_aligned_task_unit; + end = std::min(end * cache_line_aligned_task_unit, param.numel()); + scalar_t* param_ptr = param_data + begin; + scalar_t* grad_ptr = grad_data + begin; + scalar_t* momentum_buffer_ptr = has_momentum_buffer ? momentum_buffer_data + begin : nullptr; + + const int64_t size = end - begin; + sgd_math( + param_ptr, + grad_ptr, + momentum_buffer_ptr, + weight_decay, + momentum, + lr, + dampening, + nesterov, + maximize, + is_first_step, + grad_scale_ptr, + size + ); + }; + at::parallel_for( + 0, num_units, 0, sgd_fn); +} + +void fused_sgd_kernel( + const at::Tensor& param, + const at::Tensor& grad, + const at::Tensor& momentum_buffer, + const double weight_decay, + const double momentum, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const float* grad_scale_ptr + ) { + Tensor grad_contiguous = grad.contiguous(); + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, param.scalar_type(), "fused_sgd_kernel", [&] { + sgd_fused_step_impl( + param, + grad, + momentum_buffer, + weight_decay, + momentum, + lr, + dampening, + nesterov, + maximize, + is_first_step, + grad_scale_ptr); + }); +} + +} + +REGISTER_DISPATCH(fused_sgd_stub, &fused_sgd_kernel); +} // namespace at::native diff --git a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp index 5c02472be592a..0a704e5419c4f 100644 --- a/aten/src/ATen/native/cpu/GridSamplerKernel.cpp +++ b/aten/src/ATen/native/cpu/GridSamplerKernel.cpp @@ -7,12 +7,10 @@ #include #include #include -#include #include #include #include -#include namespace at::native { namespace { @@ -110,7 +108,7 @@ namespace at::native { namespace { * // 3. writes the first `len` values in the interpolated vector to * // `out_slice` with spatial offset being `offset`. * // - * // This assimes that `grid_x` and `grid_y` all contain valid grid + * // This assumes that `grid_x` and `grid_y` all contain valid grid * // values \in [-1, 1], even at indices greater than `len`. 
* // * // The `*_slice` argument names mean samples within a batch (i.e., @@ -391,8 +389,7 @@ struct ComputeLocation } inline std::pair apply_get_grad(const Vec &in) const { - Vec res, grad_clip; - std::tie(res, grad_clip) = clip_coordinates_get_grad(unnormalize(in)); + auto [res, grad_clip] = clip_coordinates_get_grad(unnormalize(in)); return std::make_pair(res, grad_clip & Vec(scaling_factor)); } }; @@ -423,8 +420,8 @@ struct ComputeLocation } inline std::pair apply_get_grad(const Vec &in) const { - Vec res, grad_refl, grad_clip, grad(scaling_factor); - std::tie(res, grad_refl) = reflect_coordinates_get_grad(unnormalize(in)); + auto [res, grad_refl] = reflect_coordinates_get_grad(unnormalize(in)); + Vec grad_clip, grad(scaling_factor); grad = grad_refl * grad; std::tie(res, grad_clip) = clip_coordinates_get_grad(res); grad = grad_clip & grad; @@ -475,7 +472,7 @@ struct ApplyGridSample compute_W; const bool must_in_bound = padding != GridSamplerPadding::Zeros; - ApplyGridSample(const TensorAccessor& input) + ApplyGridSample(const TensorAccessor& input) : inp_H(input.size(2)) , inp_W(input.size(3)) , inp_sH(input.stride(2)) @@ -541,7 +538,7 @@ struct ApplyGridSample& out_slice, - const TensorAccessor& inp_slice, + const TensorAccessor& inp_slice, int64_t offset, const Vec& grid_x, const Vec& grid_y, int64_t len) const { auto x = compute_W.apply(grid_x); @@ -591,20 +588,16 @@ struct ApplyGridSample inline void backward(TensorAccessor* gInp_slice_ptr, TensorAccessor& gGrid_slice, - const TensorAccessor& gOut_slice, - const TensorAccessor& inp_slice, + const TensorAccessor& gOut_slice, + const TensorAccessor& inp_slice, int64_t offset, const Vec& grid_x, const Vec& grid_y, int64_t len) const { - Vec x, y, gx_mult, gy_mult; - std::tie(x, gx_mult) = compute_W.apply_get_grad(grid_x); - std::tie(y, gy_mult) = compute_H.apply_get_grad(grid_y); + auto [x, gx_mult] = compute_W.apply_get_grad(grid_x); + auto [y, gy_mult] = compute_H.apply_get_grad(grid_y); - Vec n, s, w, e, nw, ne, sw, se, nw_mask, ne_mask, sw_mask, se_mask; - iVec i_y_n, i_x_w; - - std::tie( + auto [ n, s, w, e, nw, ne, sw, se, nw_mask, ne_mask, sw_mask, se_mask, - i_y_n, i_x_w) = compute_interp_params(x, y); + i_y_n, i_x_w] = compute_interp_params(x, y); auto i_nw_offset = i_y_n * iVec(inp_sH) + i_x_w * iVec(inp_sW); auto i_ne_offset = i_nw_offset + iVec(inp_sW); @@ -722,7 +715,7 @@ struct ApplyGridSample compute_W; const bool must_in_bound = padding != GridSamplerPadding::Zeros; - ApplyGridSample(const TensorAccessor& input) + ApplyGridSample(const TensorAccessor& input) : inp_H(input.size(2)) , inp_W(input.size(3)) , inp_sH(input.stride(2)) @@ -733,7 +726,7 @@ struct ApplyGridSample& out_slice, - const TensorAccessor& inp_slice, + const TensorAccessor& inp_slice, int64_t offset, const Vec& grid_x, const Vec& grid_y, int64_t len) const { auto x = compute_W.apply(grid_x); @@ -769,8 +762,8 @@ struct ApplyGridSample inline void backward(TensorAccessor* gInp_slice_ptr, TensorAccessor& gGrid_slice, - const TensorAccessor& gOut_slice, - const TensorAccessor& /*inp_slice*/, + const TensorAccessor& gOut_slice, + const TensorAccessor& /*inp_slice*/, int64_t offset, const Vec& grid_x, const Vec& grid_y, int64_t len) const { if (input_requires_grad) { @@ -834,7 +827,7 @@ struct ApplyGridSample& input) + ApplyGridSample(const TensorAccessor& input) : inp_H(input.size(2)) , inp_W(input.size(3)) , inp_sH(input.stride(2)) @@ -913,7 +906,7 @@ struct ApplyGridSample& out_slice, - const TensorAccessor& inp_slice, + const TensorAccessor& inp_slice, 
int64_t offset, const Vec& grid_x, const Vec& grid_y, int64_t len) const { @@ -957,8 +950,8 @@ struct ApplyGridSample inline void backward(TensorAccessor* gInp_slice_ptr, TensorAccessor& gGrid_slice, - const TensorAccessor& gOut_slice, - const TensorAccessor& inp_slice, + const TensorAccessor& gOut_slice, + const TensorAccessor& inp_slice, int64_t offset, const Vec& grid_x, const Vec& grid_y, int64_t len) const { Vec x = compute_W.unnormalize(grid_x); @@ -1028,7 +1021,7 @@ struct ApplyGridSample static inline void grid_sample_2d_grid_slice_iterator( - const TensorAccessor& grid_slice, const ApplyFn &apply_fn) { + const TensorAccessor& grid_slice, const ApplyFn &apply_fn) { int64_t out_H = grid_slice.size(0); int64_t out_W = grid_slice.size(1); int64_t grid_sH = grid_slice.stride(0); @@ -1193,8 +1186,8 @@ void grid_sampler_2d_cpu_kernel_impl( AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler_2d_cpu_kernel_impl", [&] { auto out_acc = output.accessor(); - auto inp_acc = input.accessor(); - auto grid_acc = grid.accessor(); + auto inp_acc = input.accessor(); + auto grid_acc = grid.accessor(); if (align_corners) { switch (static_cast(interpolation_mode)) { HANDLE_INTERP(GridSamplerInterpolation::Bilinear, true); @@ -1281,9 +1274,9 @@ void grid_sampler_2d_backward_cpu_kernel_impl( AT_DISPATCH_FLOATING_TYPES(input.scalar_type(), "grid_sampler_2d_backward_cpu_kernel_impl", [&] { auto gGrid_acc = grad_grid.accessor(); - auto inp_acc = input.accessor(); - auto grid_acc = grid.accessor(); - auto gOut_acc = grad_output.accessor(); + auto inp_acc = input.accessor(); + auto grid_acc = grid.accessor(); + auto gOut_acc = grad_output.accessor(); if (input_requires_grad) { auto gInp_acc = grad_input.accessor(); if (align_corners) { diff --git a/aten/src/ATen/native/cpu/HistogramKernel.cpp b/aten/src/ATen/native/cpu/HistogramKernel.cpp index e3a2b6c30bb80..196bfd5647a76 100644 --- a/aten/src/ATen/native/cpu/HistogramKernel.cpp +++ b/aten/src/ATen/native/cpu/HistogramKernel.cpp @@ -98,14 +98,14 @@ void histogramdd_cpu_contiguous(Tensor& hist, const TensorList& bin_edges, return; } - TensorAccessor accessor_in = input.accessor(); + TensorAccessor accessor_in = input.accessor(); /* Constructs a c10::optional containing an accessor iff * the optional weight tensor has a value. */ const auto accessor_wt = weight.has_value() - ? c10::optional>(weight.value().accessor()) - : c10::optional>(); + ? 
c10::optional>(weight.value().accessor()) + : c10::optional>(); std::vector bin_seq(D); std::vector num_bin_edges(D); @@ -292,10 +292,10 @@ void infer_bin_edges_from_input(const Tensor& input, const int64_t N, TORCH_INTERNAL_ASSERT(min.is_contiguous() && max.is_contiguous()); - const scalar_t *min_data = min.data_ptr(); + const scalar_t *min_data = min.const_data_ptr(); std::copy(min_data, min_data + N, leftmost_edges.begin()); - const scalar_t *max_data = max.data_ptr(); + const scalar_t *max_data = max.const_data_ptr(); std::copy(max_data, max_data + N, rightmost_edges.begin()); } diff --git a/aten/src/ATen/native/cpu/IndexKernel.cpp b/aten/src/ATen/native/cpu/IndexKernel.cpp index 36ce92f04d80b..1640d2d400edd 100644 --- a/aten/src/ATen/native/cpu/IndexKernel.cpp +++ b/aten/src/ATen/native/cpu/IndexKernel.cpp @@ -54,11 +54,12 @@ template void cpu_take_put_kernel( TensorIterator& iter, const TensorBase& indexed, + bool is_indexed_data_mutated, const func_t& f, bool serial_execution=false) { // This kernel follows the same strategy as `cpu_index_kernel` // Even though the indexed_tensor is const, we modify it through the data_ptr - // This is a bit dirty, but otherwise it would be necessary to innecessarily add tensor + // This is a bit dirty, but otherwise it would be necessary to unnecessarily add tensor // with zero strides to `iter` which would not be much better // When launch the parallel version, set a relative small grain size less than the INTERNAL::GRAIN_SIZE @@ -70,7 +71,9 @@ void cpu_take_put_kernel( const auto numel = indexed.numel(); const auto offset_indexed = IndexToOffset(indexed); - auto* indexed_data = indexed.data_ptr(); + auto* indexed_data = is_indexed_data_mutated ? + indexed.data_ptr() + : const_cast(indexed.const_data_ptr()); auto loop = [&](char** data, const int64_t* strides, int64_t n) { auto* iterated_data_bytes = data[0]; auto* index_data_bytes = data[1]; @@ -115,21 +118,21 @@ void put_kernel( bool use_parallel_for = (!is_deterministic) && ( (iter.numel() >= internal::GRAIN_SIZE) && (at::get_num_threads() > 1)); if (use_parallel_for && iter.dtype() == ScalarType::Float) { - cpu_take_put_kernel(iter, self, + cpu_take_put_kernel(iter, self, true, [](float& iterated, float* indexed, const int64_t idx) { cpu_atomic_add_float(indexed+idx, iterated); }); } else { // TODO: investigate parallelization of the accumulate kernel. // Unlike the non-accumulate case, this needs to be thread-safe. 
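      // Note: accumulation is what makes parallelism tricky here. If two
      // entries of `iter` carry the same destination index, running
      // `indexed[idx] += iterated` from multiple threads is a data race.
      // The float branch above stays parallel only because it routes the
      // update through cpu_atomic_add_float; every other dtype takes the
      // serial path below (serial_execution=true).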
- cpu_take_put_kernel(iter, self, + cpu_take_put_kernel(iter, self, true, [](scalar_t& iterated, scalar_t* indexed, const int64_t idx) { indexed[idx] += iterated; }, /*serial_execution=*/true); } } else { - cpu_take_put_kernel(iter, self, + cpu_take_put_kernel(iter, self, true, [](scalar_t& iterated, scalar_t* indexed, const int64_t idx) { indexed[idx] = iterated; }); @@ -142,8 +145,8 @@ void take_kernel( const TensorBase & input) { AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3(ScalarType::Half, ScalarType::Bool, ScalarType::BFloat16, iter.dtype(), "take_cpu", [&] { - cpu_take_put_kernel(iter, input, - [](scalar_t& iterated, scalar_t* indexed, const int64_t idx) { + cpu_take_put_kernel(iter, input, false, + [](scalar_t& iterated, const scalar_t* indexed, const int64_t idx) { iterated = indexed[idx]; }); }); @@ -332,7 +335,7 @@ void masked_fill_kernel(TensorIterator& iter, const Scalar& value) { template void cpu_masked_scatter_kernel(TensorIterator& iter, const TensorBase& source) { std::ptrdiff_t source_cntr = 0; - scalar_t* source_ptr = source.data_ptr(); + const scalar_t* source_ptr = source.const_data_ptr(); auto numel = source.numel(); auto loop = [&](char** data, const int64_t* strides, int64_t n) { @@ -744,11 +747,11 @@ void flip_kernel(TensorIterator& iter, const bool quantized) { // Special cases: // a) channels last hflip on (N, C, H, W) and outer_stride(=dtype_size * C) in [2, 16] // b) flip dim=-2 on (N, ..., M, C) and outer_stride(=dtype_size * C) in [2, 16] - auto output_strides = iter.strides(0); - auto input_strides = iter.strides(1); - auto c = -output_strides[1]; + auto output_strides_2 = iter.strides(0); + auto input_strides_2 = iter.strides(1); + auto c = -output_strides_2[1]; if (c >= 2 && c <= 16 && - c == input_strides[1] && + c == input_strides_2[1] && c == iter.element_size(0) * iter.shape()[0] // checks if dim=1 is contiguous as well ) { return cpu_hflip_channels_last_vec(iter); diff --git a/aten/src/ATen/native/cpu/LerpKernel.cpp b/aten/src/ATen/native/cpu/LerpKernel.cpp index c9bf7525a76fb..7eaac38c21c8a 100644 --- a/aten/src/ATen/native/cpu/LerpKernel.cpp +++ b/aten/src/ATen/native/cpu/LerpKernel.cpp @@ -72,9 +72,8 @@ void lerp_scalar_kernel(at::TensorIteratorBase& iter, const Scalar& weight) { return lerp(self_val, end_val, weight_val); }, [=](bVec self_vec, bVec end_vec) -> bVec { - fVec self_vec0, self_vec1, end_vec0, end_vec1; - std::tie(self_vec0, self_vec1) = convert_bfloat16_float(self_vec); - std::tie(end_vec0, end_vec1) = convert_bfloat16_float(end_vec); + auto [self_vec0, self_vec1] = convert_bfloat16_float(self_vec); + auto [end_vec0, end_vec1] = convert_bfloat16_float(end_vec); auto result0 = lerp_vec(self_vec0, end_vec0, weight_vec); auto result1 = lerp_vec(self_vec1, end_vec1, weight_vec); return convert_float_bfloat16(result0, result1); @@ -90,9 +89,8 @@ void lerp_scalar_kernel(at::TensorIteratorBase& iter, const Scalar& weight) { return lerp(self_val, end_val, weight_val); }, [=](hVec self_vec, hVec end_vec) -> hVec { - fVec self_vec0, self_vec1, end_vec0, end_vec1; - std::tie(self_vec0, self_vec1) = convert_half_float(self_vec); - std::tie(end_vec0, end_vec1) = convert_half_float(end_vec); + auto [self_vec0, self_vec1] = convert_half_float(self_vec); + auto [end_vec0, end_vec1] = convert_half_float(end_vec); auto result0 = lerp_vec(self_vec0, end_vec0, weight_vec); auto result1 = lerp_vec(self_vec1, end_vec1, weight_vec); return convert_float_half(result0, result1); @@ -116,34 +114,30 @@ void lerp_scalar_kernel(at::TensorIteratorBase& iter, const 
Scalar& weight) { void lerp_tensor_kernel(at::TensorIteratorBase& iter) { if (iter.common_dtype() == kBFloat16) { using bVec = Vectorized; - using fVec = Vectorized; at::native::cpu_kernel_vec( iter, [=](BFloat16 self_val, BFloat16 end_val, BFloat16 weight_val) -> BFloat16 { return lerp(self_val, end_val, weight_val); }, [=](bVec self_vec, bVec end_vec, bVec weight_vec) -> bVec { - fVec self_vec0, self_vec1, end_vec0, end_vec1, weight_vec0, weight_vec1; - std::tie(self_vec0, self_vec1) = convert_bfloat16_float(self_vec); - std::tie(end_vec0, end_vec1) = convert_bfloat16_float(end_vec); - std::tie(weight_vec0, weight_vec1) = convert_bfloat16_float(weight_vec); + auto [self_vec0, self_vec1] = convert_bfloat16_float(self_vec); + auto [end_vec0, end_vec1] = convert_bfloat16_float(end_vec); + auto [weight_vec0, weight_vec1] = convert_bfloat16_float(weight_vec); auto result0 = lerp_vec(self_vec0, end_vec0, weight_vec0); auto result1 = lerp_vec(self_vec1, end_vec1, weight_vec1); return convert_float_bfloat16(result0, result1); }); } else if (iter.common_dtype() == kHalf) { using hVec = Vectorized; - using fVec = Vectorized; at::native::cpu_kernel_vec( iter, [=](Half self_val, Half end_val, Half weight_val) -> Half { return lerp(self_val, end_val, weight_val); }, [=](hVec self_vec, hVec end_vec, hVec weight_vec) -> hVec { - fVec self_vec0, self_vec1, end_vec0, end_vec1, weight_vec0, weight_vec1; - std::tie(self_vec0, self_vec1) = convert_half_float(self_vec); - std::tie(end_vec0, end_vec1) = convert_half_float(end_vec); - std::tie(weight_vec0, weight_vec1) = convert_half_float(weight_vec); + auto [self_vec0, self_vec1] = convert_half_float(self_vec); + auto [end_vec0, end_vec1] = convert_half_float(end_vec); + auto [weight_vec0, weight_vec1] = convert_half_float(weight_vec); auto result0 = lerp_vec(self_vec0, end_vec0, weight_vec0); auto result1 = lerp_vec(self_vec1, end_vec1, weight_vec1); return convert_float_half(result0, result1); diff --git a/aten/src/ATen/native/cpu/MaxPoolKernel.cpp b/aten/src/ATen/native/cpu/MaxPoolKernel.cpp index 06421ee57a0bb..17e9b752d6c53 100644 --- a/aten/src/ATen/native/cpu/MaxPoolKernel.cpp +++ b/aten/src/ATen/native/cpu/MaxPoolKernel.cpp @@ -65,7 +65,7 @@ template inline typename std::enable_if::value, void>::type compute_internal( - scalar_t* input_data, + const scalar_t* input_data, scalar_t* out_data, opmath_t* max_ptr, vec::int_same_size_t* index_ptr, @@ -99,7 +99,7 @@ compute_internal( for (int64_t id = id0; id < id1; id += dilationD) { for (int64_t ih = ih0; ih < ih1; ih += dilationH) { for (int64_t iw = iw0; iw < iw1; iw += dilationW) { - scalar_t* in = input_data + (n * input_depth * input_height * input_width + + const scalar_t* in = input_data + (n * input_depth * input_height * input_width + id * input_height * input_width + ih * input_width + iw) * channels; int64_t d2 = 0; @@ -138,7 +138,7 @@ template inline typename std::enable_if::value, void>::type compute_internal( - scalar_t* input_data, + const scalar_t* input_data, scalar_t* out_data, opmath_t* max_ptr, vec::int_same_size_t* index_ptr, @@ -172,15 +172,14 @@ compute_internal( for (int64_t id = id0; id < id1; id += dilationD) { for (int64_t ih = ih0; ih < ih1; ih += dilationH) { for (int64_t iw = iw0; iw < iw1; iw += dilationW) { - scalar_t* in = input_data + (n * input_depth * input_height * input_width + + const scalar_t* in = input_data + (n * input_depth * input_height * input_width + id * input_height * input_width + ih * input_width + iw) * channels; int64_t d2 = 0; for (; d2 < len; d2 += 
Vec::size()) { iVec index_ivec = iVec(id * input_height * input_width + ih * input_width + iw); Vec val_bvec = Vec::loadu(in + d2); - fVec val_fvec0, val_fvec1; - std::tie(val_fvec0, val_fvec1) = convert_to_float(val_bvec); + auto [val_fvec0, val_fvec1] = convert_to_float(val_bvec); iVec maxindex_ivec0 = iVec::loadu(index_ptr + d2); iVec maxindex_ivec1 = iVec::loadu(index_ptr + d2 + iVec::size()); @@ -260,7 +259,7 @@ void cpu_max_pool( auto output = output_.contiguous(); auto indices = indices_.contiguous(); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); auto indices_data = indices.data_ptr(); @@ -291,7 +290,7 @@ void cpu_max_pool( // parallel on dim N, C at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { for (int64_t c = begin; c < end; c++) { - scalar_t* input_ptr = input_data + c * input_depth * input_height * input_width; + const scalar_t* input_ptr = input_data + c * input_depth * input_height * input_width; scalar_t* output_ptr = output_data + c * output_depth * output_height * output_width; int64_t* indices_ptr = indices_data + c * output_depth * output_height * output_width; @@ -390,7 +389,7 @@ void cpu_max_pool_channels_last( auto output = output_.contiguous(memory_format); auto indices = indices_.contiguous(memory_format); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); auto indices_data = indices.data_ptr(); @@ -406,7 +405,7 @@ void cpu_max_pool_channels_last( using opmath_t = at::opmath_type; using Vec = vec::Vectorized; using integer_t = vec::int_same_size_t; - // for the convience of vectorization, use integer of the same size of scalar_t, + // for the convenience of vectorization, use integer of the same size of scalar_t, // e.g. 
int32_t for float, int64_t for double // need to make sure doesn't overflow TORCH_CHECK(input_depth * input_height * input_width <= std::numeric_limits::max()); @@ -476,8 +475,8 @@ void cpu_max_pool_backward( auto indices = indices_.contiguous(); auto grad_input = grad_input_.contiguous(); - auto grad_output_data = grad_output.data_ptr(); - auto indices_data = indices.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); + auto indices_data = indices.const_data_ptr(); auto grad_input_data = grad_input.mutable_data_ptr(); // treat batch size and channels as one dimension @@ -508,8 +507,8 @@ void cpu_max_pool_backward( at::parallel_for(0, channels, 0, [&](int64_t begin, int64_t end) { for (const auto c : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + c * input_depth * input_height * input_width; - scalar_t* grad_output_ptr = grad_output_data + c * output_depth * output_height * output_width; - int64_t * indices_ptr = indices_data + c * output_depth * output_height * output_width; + const scalar_t* grad_output_ptr = grad_output_data + c * output_depth * output_height * output_width; + const int64_t * indices_ptr = indices_data + c * output_depth * output_height * output_width; for (int64_t od = 0; od < output_depth; od++) { for (int64_t oh = 0; oh < output_height; oh++) { @@ -550,8 +549,8 @@ void cpu_max_pool_backward_channels_last( auto indices = indices_.contiguous(memory_format); auto grad_input_data = grad_input.mutable_data_ptr(); - auto grad_output_data = grad_output.data_ptr(); - auto indices_data = indices.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); + auto indices_data = indices.const_data_ptr(); // MaxPool2d: NHWC // MaxPool3d: NDHWC @@ -568,14 +567,14 @@ void cpu_max_pool_backward_channels_last( at::parallel_for(0, nbatch, 0, [&](int64_t begin, int64_t end) { for (const auto n : c10::irange(begin, end)) { scalar_t* grad_input_ptr = grad_input_data + n * input_depth * input_height * input_width * channels; - scalar_t* grad_output_ptr = grad_output_data + n * output_depth * output_height * output_width * channels; - int64_t* indices_ptr = indices_data + n * output_depth * output_height * output_width * channels; + const scalar_t* grad_output_ptr = grad_output_data + n * output_depth * output_height * output_width * channels; + const int64_t* indices_ptr = indices_data + n * output_depth * output_height * output_width * channels; for (int64_t od = 0; od < output_depth; od++) { for (int64_t oh = 0; oh < output_height; oh++) { for (int64_t ow = 0; ow < output_width; ow++) { - scalar_t* gout = grad_output_ptr + (od * output_height * output_width + oh * output_width + ow) * channels; - int64_t* ind = indices_ptr + (od * output_height * output_width + oh * output_width + ow) * channels; + const scalar_t* gout = grad_output_ptr + (od * output_height * output_width + oh * output_width + ow) * channels; + const int64_t* ind = indices_ptr + (od * output_height * output_width + oh * output_width + ow) * channels; // TODO: gcc vectorization for (int64_t c = 0; c < channels; c++) { int64_t maxindex = ind[c]; diff --git a/aten/src/ATen/native/cpu/MaxPooling.cpp b/aten/src/ATen/native/cpu/MaxPooling.cpp index 70443e67ae74d..660708a2a06d6 100644 --- a/aten/src/ATen/native/cpu/MaxPooling.cpp +++ b/aten/src/ATen/native/cpu/MaxPooling.cpp @@ -39,7 +39,7 @@ void max_pool1d_impl( [&] { const Tensor in = input.contiguous(); scalar_t* const OP = output.data_ptr(); - const scalar_t* const IP = in.data_ptr(); + const scalar_t* const IP = 
in.const_data_ptr(); // Value used for padding scalar_t FILL = std::numeric_limits::has_infinity diff --git a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp index c9dc3eded2a19..d5af5d23e8b10 100644 --- a/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp +++ b/aten/src/ATen/native/cpu/MaxUnpoolKernel.cpp @@ -20,8 +20,8 @@ void cpu_max_unpool( const Tensor& indices) { auto output = output_.contiguous(); - auto input_data = input.data_ptr(); - auto indices_data = indices.data_ptr(); + auto input_data = input.const_data_ptr(); + auto indices_data = indices.const_data_ptr(); auto output_data = output.data_ptr(); // NB: input tensor dimensions: @@ -105,8 +105,8 @@ void cpu_max_unpool_channels_last( auto memory_format = at::MemoryFormat::ChannelsLast; auto output = output_.contiguous(memory_format); - auto input_data = input.data_ptr(); - auto indices_data = indices.data_ptr(); + auto input_data = input.const_data_ptr(); + auto indices_data = indices.const_data_ptr(); auto output_data = output.data_ptr(); int64_t nbatch = input.size(0); @@ -127,8 +127,8 @@ void cpu_max_unpool_channels_last( data_index_init(begin, n, nbatch, ip, input_image_size); for (const auto i : c10::irange(begin, end)) { - scalar_t* input_ptr = input_data + i * channels; - int64_t* indices_ptr = indices_data + i * channels; + const scalar_t* input_ptr = input_data + i * channels; + const int64_t* indices_ptr = indices_data + i * channels; scalar_t* output_ptr = output_data + n * output_image_size * channels; // can't do scatter on avx2 (only available on avx512) diff --git a/aten/src/ATen/native/cpu/MultinomialKernel.cpp b/aten/src/ATen/native/cpu/MultinomialKernel.cpp index c5c2eebb5d35e..1c4054abdf239 100644 --- a/aten/src/ATen/native/cpu/MultinomialKernel.cpp +++ b/aten/src/ATen/native/cpu/MultinomialKernel.cpp @@ -36,7 +36,7 @@ multinomial_with_replacement_apply( /* cumulative probability distribution vector */ Tensor cum_dist = at::empty({n_categories}, self.options()); - const scalar_t* const self_ptr = self.data_ptr(); + const scalar_t* const self_ptr = self.const_data_ptr(); scalar_t* const cum_dist_ptr = cum_dist.data_ptr(); int64_t* const result_ptr = result.data_ptr(); @@ -140,7 +140,7 @@ multinomial_with_replacement_apply( /* cumulative probability distribution vector */ Tensor cum_dist = at::empty({n_categories}, self.options().dtype(kFloat)); - const scalar_t* const self_ptr = self.data_ptr(); + const scalar_t* const self_ptr = self.const_data_ptr(); float* const cum_dist_ptr = cum_dist.data_ptr(); int64_t* const result_ptr = result.data_ptr(); diff --git a/aten/src/ATen/native/cpu/PaddingKernel.cpp b/aten/src/ATen/native/cpu/PaddingKernel.cpp index ca438f144b2de..302346c4515c9 100644 --- a/aten/src/ATen/native/cpu/PaddingKernel.cpp +++ b/aten/src/ATen/native/cpu/PaddingKernel.cpp @@ -17,7 +17,7 @@ struct PaddingParams { int64_t nbatch; int64_t channels; - // use vectorized logic on width when output index is in [pad, input_witdh + pad), + // use vectorized logic on width when output index is in [pad, input_width + pad), // applies only to Channels First format when pad_l and pad_r are both positive. bool is_padding_positive_width; @@ -136,7 +136,7 @@ void cpu_padding( auto input = input_.contiguous(); auto output = output_.contiguous(); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); // fold nbatch and channels into single dimension for channels first. 
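Most of the pointer-type changes in this file and the neighboring CPU kernels are the same mechanical const-correctness migration: tensors that are only read are now accessed through const_data_ptr<T>() (or a const-qualified accessor), while outputs keep data_ptr<T>() / mutable_data_ptr<T>(). A minimal standalone sketch of that read-only/mutable split, using a hypothetical copy helper rather than any kernel from this patch:

#include <ATen/ATen.h>
#include <ATen/Parallel.h>

// Hypothetical helper (not part of the patch): element-wise copy of a
// contiguous float tensor, reading through the const pointer and writing
// through the mutable one, with the work split across threads.
void copy_float_kernel(const at::Tensor& src, at::Tensor& dst) {
  TORCH_CHECK(src.is_contiguous() && dst.is_contiguous(), "expected contiguous tensors");
  TORCH_CHECK(src.numel() == dst.numel(), "size mismatch");
  const float* src_data = src.const_data_ptr<float>(); // read-only view of the input
  float* dst_data = dst.mutable_data_ptr<float>();     // mutable view of the output
  at::parallel_for(0, src.numel(), at::internal::GRAIN_SIZE,
      [&](int64_t begin, int64_t end) {
        for (int64_t i = begin; i < end; ++i) {
          dst_data[i] = src_data[i];
        }
      });
}

Besides documenting intent, the const pointer keeps the compiler from silently accepting writes through what should be a read-only input.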
@@ -158,7 +158,7 @@ void cpu_padding( // do vectorized copy whe output is overlapped with input on W, // only applies to positive padding - auto loop = [=](scalar_t* out, scalar_t* in, bool positive_padding) { + auto loop = [=](scalar_t* out, const scalar_t* in, bool positive_padding) { if (positive_padding) { for (const auto ow : c10::irange(pad_w)) { int64_t iw = PaddingType::index(ow, input_width, pad_w, offset_w); @@ -198,7 +198,7 @@ void cpu_padding( for (const auto i : c10::irange(begin, end)) { int64_t ih = PaddingType::index(oh, input_height, pad_h, offset_h); scalar_t* output_ptr = output_data + i * output_width; - scalar_t* input_ptr = input_data + c * input_height * input_width + ih * input_width; + const scalar_t* input_ptr = input_data + c * input_height * input_width + ih * input_width; loop(output_ptr, input_ptr, p.is_padding_positive_width); data_index_step(c, channels, oh, output_height); @@ -214,7 +214,7 @@ void cpu_padding( int64_t id = PaddingType::index(od, input_depth, pad_d, offset_d); int64_t ih = PaddingType::index(oh, input_height, pad_h, offset_h); scalar_t* output_ptr = output_data + i * output_width; - scalar_t* input_ptr = input_data + c * input_depth * input_height * input_width + + const scalar_t* input_ptr = input_data + c * input_depth * input_height * input_width + id * input_height * input_width + ih * input_width; loop(output_ptr, input_ptr, p.is_padding_positive_width); @@ -243,7 +243,7 @@ void cpu_padding_channels_last( auto input = input_.contiguous(memory_format); auto output = output_.contiguous(memory_format); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t nbatch = p.nbatch; @@ -274,7 +274,7 @@ void cpu_padding_channels_last( int64_t iw = PaddingType::index(ow, input_width, pad_w, offset_w); scalar_t* output_ptr = output_data + i * channels; - scalar_t* input_ptr = input_data + (n * input_height * input_width + ih * input_width + iw) * channels; + const scalar_t* input_ptr = input_data + (n * input_height * input_width + ih * input_width + iw) * channels; copy_stub(output_ptr, input_ptr, channels); data_index_step(n, nbatch, oh, output_height, ow, output_width); @@ -292,7 +292,7 @@ void cpu_padding_channels_last( int64_t iw = PaddingType::index(ow, input_width, pad_w, offset_w); scalar_t* output_ptr = output_data + i * channels; - scalar_t* input_ptr = input_data + (n * input_depth * input_height * input_width + + const scalar_t* input_ptr = input_data + (n * input_depth * input_height * input_width + id * input_height * input_width + ih * input_width + iw) * channels; copy_stub(output_ptr, input_ptr, channels); @@ -317,7 +317,7 @@ void cpu_padding_backward( auto grad_output = grad_output_.contiguous(); auto grad_input = grad_input_.contiguous(); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); auto grad_input_data = grad_input.data_ptr(); // fold nbatch and channels into single dimension for channels first. 
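The backward hunks that follow accumulate into grad_input rather than assigning, because under reflection several output positions can map to the same input element. A toy 1-D sketch of that accumulation, assuming the textbook reflect mapping and 0 <= pad < input_width; the real kernels take their index mapping from PaddingType::index and additionally handle 2-D/3-D shapes, channels-last layout, and the other padding modes:

#include <cstdint>
#include <vector>

// Toy 1-D reflection-padding backward (hypothetical helper, not the patch's code).
// grad_output has input_width + 2 * pad elements; assumes pad < input_width so
// a single reflection suffices.
std::vector<float> reflect_pad1d_backward(const std::vector<float>& grad_output,
                                          int64_t input_width, int64_t pad) {
  std::vector<float> grad_input(input_width, 0.f);
  const int64_t output_width = input_width + 2 * pad;
  for (int64_t ow = 0; ow < output_width; ++ow) {
    int64_t iw = ow - pad;                                   // shift into input coordinates
    if (iw < 0) iw = -iw;                                    // reflect across the left edge
    if (iw >= input_width) iw = 2 * (input_width - 1) - iw;  // reflect across the right edge
    grad_input[iw] += grad_output[ow];  // accumulate: different ow can hit the same iw
  }
  return grad_input;
}

For input_width = 4 and pad = 2, output positions {0, 4, 6} all land on input index 2, which is exactly why assignment instead of += would drop gradient.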
@@ -351,7 +351,7 @@ void cpu_padding_backward( // parallel on N,C, sequential on H,W at::parallel_for(0, channels, 1, [&](int64_t begin, int64_t end) { for (const auto c : c10::irange(begin, end)) { - scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; + const scalar_t* grad_output_ptr = grad_output_data + c * output_height * output_width; scalar_t* grad_input_ptr = grad_input_data + c * input_height * input_width; for (const auto oh : c10::irange(output_height)) { @@ -367,7 +367,7 @@ void cpu_padding_backward( // parallel on N,C, sequential on D,H,W at::parallel_for(0, channels, 1, [&](int64_t begin, int64_t end) { for (const auto c : c10::irange(begin, end)) { - scalar_t* grad_output_ptr = grad_output_data + c * output_depth *output_height * output_width; + const scalar_t* grad_output_ptr = grad_output_data + c * output_depth *output_height * output_width; scalar_t* grad_input_ptr = grad_input_data + c * input_depth * input_height * input_width; for (const auto od : c10::irange(output_depth)) { @@ -406,7 +406,7 @@ void cpu_padding_backward_channels_last( auto grad_output = grad_output_.contiguous(memory_format); auto grad_input_data = grad_input.data_ptr(); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); int64_t nbatch = p.nbatch; int64_t channels = p.channels; @@ -435,7 +435,7 @@ void cpu_padding_backward_channels_last( int64_t iw = PaddingType::index(ow, input_width, pad_w, offset_w); scalar_t* grad_input_ptr = grad_input_data + (n * input_height * input_width + ih * input_width + iw) * channels; - scalar_t* grad_output_ptr = grad_output_data + + const scalar_t* grad_output_ptr = grad_output_data + (n * output_height * output_width + oh * output_width + ow) * channels; add_stub(grad_input_ptr, grad_output_ptr, channels); } @@ -455,7 +455,7 @@ void cpu_padding_backward_channels_last( scalar_t* grad_input_ptr = grad_input_data + (n * input_depth * input_height * input_width + id * input_height * input_width + ih * input_width + iw) * channels; - scalar_t* grad_output_ptr = grad_output_data + + const scalar_t* grad_output_ptr = grad_output_data + (n * output_depth * output_height * output_width + od * output_height * output_width + oh * output_width + ow) * channels; add_stub(grad_input_ptr, grad_output_ptr, channels); diff --git a/aten/src/ATen/native/cpu/PixelShuffleKernel.cpp b/aten/src/ATen/native/cpu/PixelShuffleKernel.cpp index b654518ae273a..d81e3c50fcea5 100644 --- a/aten/src/ATen/native/cpu/PixelShuffleKernel.cpp +++ b/aten/src/ATen/native/cpu/PixelShuffleKernel.cpp @@ -17,7 +17,7 @@ void cpu_pixel_shuffle( TensorBase& output, const TensorBase& input, int64_t upscale_factor) { - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); // [(B1...Bn), C, H, W] => [N, C, H, W] @@ -59,7 +59,7 @@ void cpu_pixel_shuffle_channels_last( int64_t upscale_factor) { TORCH_CHECK(input.ndimension() == 4, "pixel shuffle with channels last format supports tensors with 4 dims"); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t nbatch = input.size(0); @@ -81,7 +81,7 @@ void cpu_pixel_shuffle_channels_last( data_index_init(begin, n, nbatch, h, height); for (const auto i : c10::irange(begin, end)) { for (const auto w : c10::irange(width)) { - scalar_t* input_ptr = input_data + n * height * width * channels + h * width * channels + w * channels; + const scalar_t* input_ptr = 
input_data + n * height * width * channels + h * width * channels + w * channels; // step 1: transpose each channel lane // from: [c, s1*s2] @@ -115,7 +115,7 @@ void cpu_pixel_unshuffle( TensorBase& output, const TensorBase& input, int64_t downscale_factor) { - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); // [(B1...Bn), C, H, W] => [N, C, H, W] @@ -158,7 +158,7 @@ void cpu_pixel_unshuffle_channels_last( int64_t downscale_factor) { TORCH_CHECK(input.ndimension() == 4, "pixel unshuffle with channels last format supports tensors with 4 dims"); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t nbatch = input.size(0); diff --git a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp index 25243b2b19107..e02e57828e9b3 100644 --- a/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/PointwiseOpsKernel.cpp @@ -117,10 +117,9 @@ static void smooth_l1_backward_cpu_kernel(TensorIterator& iter, const Scalar& no // 1 if x >= beta // -1 if x <= -beta // x / beta if |x| < beta - Vectorized input0, input1, target0, target1, grad_output0, grad_output1; - std::tie(input0, input1) = convert_bfloat16_float(input); - std::tie(target0, target1) = convert_bfloat16_float(target); - std::tie(grad_output0, grad_output1) = convert_bfloat16_float(grad_output); + auto [input0, input1] = convert_bfloat16_float(input); + auto [target0, target1] = convert_bfloat16_float(target); + auto [grad_output0, grad_output1] = convert_bfloat16_float(grad_output); auto x = input0 - target0; auto pos_or_neg_1_vec = Vectorized::blendv( neg_1_vec, pos_1_vec, x > zero_vec); diff --git a/aten/src/ATen/native/cpu/Reduce.h b/aten/src/ATen/native/cpu/Reduce.h index fdb1c0d1a0fce..26155373be589 100644 --- a/aten/src/ATen/native/cpu/Reduce.h +++ b/aten/src/ATen/native/cpu/Reduce.h @@ -7,6 +7,7 @@ #include #include +#include namespace at { namespace native { inline namespace CPU_CAPABILITY { @@ -154,7 +155,7 @@ static void set_results(const std::tuple& result, const TensorIterator } template -struct all_same : guts::conjunction< +struct all_same : std::conjunction< std::is_same... 
> {}; diff --git a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp index 125f3ce3d11fd..04fc88d1d147e 100644 --- a/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceAllOpsKernel.cpp @@ -29,7 +29,7 @@ inline void reduce_all_impl_vec( vec_func_t vop) { using Vec = Vectorized>; const int64_t input_numel = input.numel(); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); // NOTE: parallel_reduce not support bool type scalar_t result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v, [&](int64_t start, int64_t end, const scalar_t /*ident*/) -> scalar_t { @@ -50,7 +50,7 @@ inline void reduce_all_impl( const scalar_t ident_v, func_t op) { const int64_t input_numel = input.numel(); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); scalar_t result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v, [&](int64_t start, int64_t end, const scalar_t ident) -> scalar_t { scalar_t partial_out = ident; @@ -123,7 +123,7 @@ inline void reduce_all_impl_two_outputs( func_t2 reduce_acc_func) { using scalar_t_pair = std::pair; const int64_t input_numel = input.numel(); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); scalar_t_pair result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v, [&](int64_t start, int64_t end, const scalar_t_pair& ident) -> scalar_t_pair { scalar_t_pair partial_out(ident); @@ -150,7 +150,7 @@ inline void reduce_all_impl_vec_two_outputs( using Vec = Vectorized>; using scalar_t_pair = std::pair; const int64_t input_numel = input.numel(); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); // NOTE: parallel_reduce not support bool type std::pair result = at::parallel_reduce(0, input_numel, internal::GRAIN_SIZE, ident_v, [&](int64_t start, int64_t end, const scalar_t_pair& /* ident */) -> scalar_t_pair { diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index 92250c115022c..c935f81f9ff08 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -53,7 +53,7 @@ static inline void cpu_cum_base_kernel(const Tensor& result, // NOLINTNEXTLINE(bugprone-argument-comment) .declare_static_shape(self.sizes(), /*squash_dim=*/dim) .add_output(result) - .add_input(self) + .add_const_input(self) .build(); auto result_dim_stride = ensure_nonempty_stride(result, dim); @@ -183,8 +183,7 @@ inline void norm_two_reduce_step(Vectorized& acc_vec, Vectorized inline void norm_two_reduce_step(Vectorized& acc_fvec, Vectorized& data_bvec) { - Vectorized data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_bfloat16_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_bfloat16_float(data_bvec); acc_fvec += data_fvec0 * data_fvec0; acc_fvec += data_fvec1 * data_fvec1; } @@ -196,7 +195,7 @@ template void norm_kernel_cpu_impl(TensorIterator& iter, const double& val) { if (val == 0.0) { binary_kernel_reduce(iter, NormZeroOps(), acc_t(0)); - } else if (val == 0.0) { + } else if (val == 1.0) { binary_kernel_reduce(iter, NormOneOps(), acc_t(0)); } else if (val == 2.0) { binary_kernel_reduce(iter, NormTwoOps(), acc_t(0)); @@ -291,7 +290,9 @@ static void and_kernel_impl(TensorIterator& iter) { iter, [=](uint8_t a, uint8_t b) -> uint8_t { return (a && b) ? 
1 : 0; }, [=](Vectorized a, Vectorized b) { - return a & b; + // NB: != returns 0xFF rather than 0x01, so we must negate to get + // the desired result + return (a != Vectorized(0)).neg() & (b != Vectorized(0)).neg(); }, /*ident=*/true); } else { @@ -327,7 +328,7 @@ static void or_kernel_impl(TensorIterator& iter) { iter, [=](uint8_t a, uint8_t b) -> uint8_t { return (a || b) ? 1 : 0; }, [=](Vectorized a, Vectorized b) { - return a | b; + return (a != Vectorized(0)).neg() | (b != Vectorized(0)).neg(); }, /*ident=*/false); } else { diff --git a/aten/src/ATen/native/cpu/ReduceUtils.h b/aten/src/ATen/native/cpu/ReduceUtils.h index c54dc494fb6fa..d6afac295aff6 100644 --- a/aten/src/ATen/native/cpu/ReduceUtils.h +++ b/aten/src/ATen/native/cpu/ReduceUtils.h @@ -158,8 +158,7 @@ inline void map_acc( constexpr int64_t kaVecSize = aVec::size(); for (d = 0; d < size - (size % kVecSize); d += kVecSize) { Vec data2_vec = Vec::loadu(input_data2 + d); - aVec data2_avec0, data2_avec1; - std::tie(data2_avec0, data2_avec1) = convert_to_float(data2_vec); + auto [data2_avec0, data2_avec1] = convert_to_float(data2_vec); aVec input_vec0 = aVec::loadu(input_data + d); aVec input_vec1 = aVec::loadu(input_data + d + kaVecSize); vec_fun(input_vec0, data2_avec0).store(output_data + d); @@ -168,8 +167,7 @@ inline void map_acc( if (size - d > 0) { int64_t tail_size = size - d; Vec data2_vec = Vec::loadu(input_data2 + d, tail_size); - aVec data2_avec0, data2_avec1; - std::tie(data2_avec0, data2_avec1) = convert_to_float(data2_vec); + auto [data2_avec0, data2_avec1] = convert_to_float(data2_vec); if (tail_size > kaVecSize) { aVec input_vec0 = aVec::loadu(input_data + d); aVec input_vec1 = aVec::loadu(input_data + d + kaVecSize, tail_size - kaVecSize); @@ -199,7 +197,7 @@ inline T update(const T& x, const T& y) { } template -inline void update(scalar_t* out, scalar_t* data, int64_t K) { +inline void update(scalar_t* out, const scalar_t* data, int64_t K) { using Vec = vec::Vectorized>; map2( [](Vec x, Vec y) { return update(x, y); }, @@ -211,7 +209,7 @@ inline void update(scalar_t* out, scalar_t* data, int64_t K) { template , int> = 0> -inline void update(at::opmath_type* out, scalar_t* data, int64_t K) { +inline void update(at::opmath_type* out, const scalar_t* data, int64_t K) { using opmath_t = at::opmath_type; using Vec = vec::Vectorized; map_acc( diff --git a/aten/src/ATen/native/cpu/SampledAddmmKernel.cpp b/aten/src/ATen/native/cpu/SampledAddmmKernel.cpp index 731f91c349e7a..ed752f7b39364 100644 --- a/aten/src/ATen/native/cpu/SampledAddmmKernel.cpp +++ b/aten/src/ATen/native/cpu/SampledAddmmKernel.cpp @@ -26,8 +26,8 @@ void sampled_addmm_sparse_csr_kernel_impl( auto beta_ = beta.to(); auto alpha_ = alpha.to(); - scalar_t* mat1_data = mat1.data_ptr(); - scalar_t* mat2_data = mat2.data_ptr(); + const scalar_t* mat1_data = mat1.const_data_ptr(); + const scalar_t* mat2_data = mat2.const_data_ptr(); // mat1: {B, M, K} // mat2: {B, N, K} @@ -43,8 +43,8 @@ void sampled_addmm_sparse_csr_kernel_impl( auto col = result.col_indices().reshape({-1, nnz}); auto values_acc = values.accessor(); - auto crow_acc = crow.accessor(); - auto col_acc = col.accessor(); + auto crow_acc = crow.accessor(); + auto col_acc = col.accessor(); // usually, collapse B and M is a better option, // but for most commonly used case (mat1 and mat2 is 2d tensor), B = 1, @@ -54,8 +54,8 @@ void sampled_addmm_sparse_csr_kernel_impl( auto crow_slice = crow_acc[b]; auto col_slice = col_acc[b]; auto values_slice = values_acc[b]; - scalar_t* mat1_ptr = mat1_data 
+ b * M * K; - scalar_t* mat2_ptr = mat2_data + b * N * K; + const scalar_t* mat1_ptr = mat1_data + b * M * K; + const scalar_t* mat2_ptr = mat2_data + b * N * K; utils::parallel_sparse_csr(crow_slice, M, nnz, [&](int64_t begin, int64_t end) { for (const auto m : c10::irange(begin, end)) { diff --git a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp index cae9260b5720c..bcfc26c7df7d8 100644 --- a/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp +++ b/aten/src/ATen/native/cpu/ScatterGatherKernel.cpp @@ -186,7 +186,7 @@ struct cpu_scatter_gather_base_kernel { // NOLINTNEXTLINE(bugprone-argument-comment) .declare_static_shape(index.sizes(), /*squash_dim=*/dim) .add_output(buffer) - .add_input(index) + .add_const_input(index) .build(); auto self_dim_stride = ensure_nonempty_stride(buffer, dim); @@ -273,8 +273,8 @@ struct cpu_scatter_gather_base_kernel { // NOLINTNEXTLINE(bugprone-argument-comment) .declare_static_shape(index.sizes(), /*squash_dim=*/dim) .add_output(buffer) - .add_input(src) - .add_input(index) + .add_const_input(src) + .add_const_input(index) .build(); auto self_dim_stride = ensure_nonempty_stride(buffer, dim); @@ -369,8 +369,8 @@ struct cpu_scatter_gather_base_kernel { // NOLINTNEXTLINE(bugprone-argument-comment) .declare_static_shape(index.sizes(), /*squash_dim=*/dim) .add_output(buffer) - .add_input(src) - .add_input(index) + .add_const_input(src) + .add_const_input(index) .build(); auto self_dim_stride = ensure_nonempty_stride(buffer, dim); @@ -464,8 +464,8 @@ struct cpu_scatter_gather_base_kernel { // NOLINTNEXTLINE(bugprone-argument-comment) .declare_static_shape(index.sizes(), /*squash_dim=*/dim) .add_output(buffer) - .add_input(src) - .add_input(index) + .add_const_input(src) + .add_const_input(index) .build(); auto self_dim_stride = ensure_nonempty_stride(buffer, dim); @@ -560,8 +560,8 @@ struct cpu_scatter_gather_base_kernel { // NOLINTNEXTLINE(bugprone-argument-comment) .declare_static_shape(index.sizes(), /*squash_dim=*/dim) .add_output(buffer) - .add_input(src) - .add_input(index) + .add_const_input(src) + .add_const_input(index) .build(); auto self_dim_stride = ensure_nonempty_stride(buffer, dim); @@ -687,9 +687,9 @@ std::pair radix_sort_parallel( template void cpu_scatter_reduce_expanded_index(const Tensor& self, const Tensor& index, const Tensor& src, bool include_self) { - int64_t* index_data = index.data_ptr(); + const int64_t* index_data = index.const_data_ptr(); scalar_t* self_data = self.data_ptr(); - scalar_t* src_data = src.data_ptr(); + const scalar_t* src_data = src.const_data_ptr(); const int64_t M = ensure_nonempty_size(self, 0); const int64_t nnz = ensure_nonempty_size(index, 0); @@ -812,9 +812,9 @@ void cpu_scatter_reduce_expanded_index(const Tensor& self, const Tensor& index, template void cpu_gather_expanded_index_kernel(const Tensor& result, const Tensor& index, const Tensor& self) { - int64_t* index_data = index.data_ptr(); + const int64_t* index_data = index.const_data_ptr(); scalar_t* result_data = result.data_ptr(); - scalar_t* self_data = self.data_ptr(); + const scalar_t* self_data = self.const_data_ptr(); const int64_t M = ensure_nonempty_size(result, 0); const int64_t N = ensure_nonempty_size(self, 0); @@ -832,7 +832,7 @@ void cpu_gather_expanded_index_kernel(const Tensor& result, const Tensor& index, "index ", index, " is out of bounds for dimension ", 0, " with size ", index_upper_bound); - scalar_t* self_ptr = self_data + index * K; + const scalar_t* self_ptr = self_data + index * K; 
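        // Copy the K contiguous values of the gathered row in two passes: a
        // vectorized main loop that moves Vec::size() elements per iteration,
        // then a scalar tail loop for the remaining K % Vec::size() elements.
        // This main-loop-plus-tail split is the same pattern the other
        // vectorized CPU kernels in this patch use.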
int64_t d = 0; for (; d < K - (K % Vec::size()); d += Vec::size()) { Vec out_vec = Vec::loadu(self_ptr + d); diff --git a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp index 80e5e947d692f..5f16ea72505fa 100644 --- a/aten/src/ATen/native/cpu/SoftMaxKernel.cpp +++ b/aten/src/ATen/native/cpu/SoftMaxKernel.cpp @@ -33,7 +33,7 @@ namespace at::native { namespace { template inline void _vec_log_softmax_lastdim( - scalar_t* input_data_base, + const scalar_t* input_data_base, scalar_t* output_data_base, int64_t outer_size, int64_t dim_size) { @@ -46,10 +46,13 @@ inline void _vec_log_softmax_lastdim( 1, at::internal::GRAIN_SIZE / (sizeof(scalar_t) * dim_size)); int64_t CHUNK_SIZE = std::min(MAX_CHUNK_SIZE, outer_size); - - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size); - - parallel_for(0, outer_size, grain_size, [&](int64_t begin, int64_t end) { + // Note: grain_size value of 0 + // We don't change the number of OpenMP threads in the OpenMP thread-pool, + // so some threads do useful work, while others don't. + // We can simply use grain_size of 0 & rely upon invoke_parallel to distribute + // work among threads in an equitable manner. We compute CHUNK_SIZE to ensure + // each thread's computations would be efficient. + parallel_for(0, outer_size, 0, [&](int64_t begin, int64_t end) { // MSVC requires such a declaration of dynamic arrays // Source: https://stackoverflow.com/a/33423538 auto tmp_sum_scalar = std::make_unique(CHUNK_SIZE); @@ -60,7 +63,7 @@ inline void _vec_log_softmax_lastdim( loop_end = end - ii; for (const auto j : c10::irange(loop_end)) { int64_t i = ii + j; - scalar_t* input_data = input_data_base + i * dim_size; + const scalar_t* input_data = input_data_base + i * dim_size; max_input_arr[j] = vec::reduce_all( [](Vec& x, Vec& y) { return vec::maximum(x, y); }, input_data, @@ -68,7 +71,7 @@ inline void _vec_log_softmax_lastdim( } for (const auto j : c10::irange(loop_end)) { int64_t i = ii + j; - scalar_t* input_data = input_data_base + i * dim_size; + const scalar_t* input_data = input_data_base + i * dim_size; scalar_t max_input = max_input_arr[j]; tmp_sum_scalar[j] = vec::map_reduce_all( [max_input](Vec x) { return (x - Vec(max_input)).exp(); }, @@ -85,7 +88,7 @@ inline void _vec_log_softmax_lastdim( loop_end); for (const auto j : c10::irange(loop_end)) { int64_t i = ii + j; - scalar_t* input_data = input_data_base + i * dim_size; + const scalar_t* input_data = input_data_base + i * dim_size; scalar_t* output_data = output_data_base + i * dim_size; scalar_t tmp_sum = tmp_sum_scalar[j]; scalar_t max_input = max_input_arr[j]; @@ -110,15 +113,15 @@ inline void _vec_log_softmax_lastdim( template inline typename std::enable_if_t>, void> _vec_softmax_lastdim( - scalar_t* input_data_base, + const scalar_t* input_data_base, scalar_t* output_data_base, int64_t outer_size, int64_t dim_size) { using Vec = vec::Vectorized; - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size); - parallel_for(0, outer_size, grain_size, [&](int64_t begin, int64_t end) { + // See Note: grain_size value of 0 + parallel_for(0, outer_size, 0, [&](int64_t begin, int64_t end) { for (const auto i : c10::irange(begin, end)) { - scalar_t* input_data = input_data_base + i * dim_size; + const scalar_t* input_data = input_data_base + i * dim_size; scalar_t* output_data = output_data_base + i * dim_size; scalar_t max_input = vec::reduce_all( [](Vec& x, Vec& y) { return vec::maximum(x, y); }, @@ -144,20 +147,20 @@ _vec_softmax_lastdim( template inline typename 
std::enable_if_t>, void> _vec_softmax_lastdim( - scalar_t* input_data_base, + const scalar_t* input_data_base, scalar_t* output_data_base, int64_t outer_size, int64_t dim_size) { using Vec = vec::Vectorized; using fVec = vec::Vectorized; - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size); - parallel_for(0, outer_size, grain_size, [&](int64_t begin, int64_t end) { + // See Note: grain_size value of 0 + parallel_for(0, outer_size, 0, [&](int64_t begin, int64_t end) { // thread local temp buffer. auto buffer = std::make_unique(dim_size); float* buffer_data = buffer.get(); for (const auto i : c10::irange(begin, end)) { - scalar_t* input_data = input_data_base + i * dim_size; + const scalar_t* input_data = input_data_base + i * dim_size; scalar_t* output_data = output_data_base + i * dim_size; // reduce to max and cache float input data fVec max_fvec = fVec(-std::numeric_limits::infinity()); @@ -210,24 +213,21 @@ _vec_softmax_lastdim( template inline void _vec_host_softmax_backward_lastdim( scalar_t* grad_input_data_base, - scalar_t* grad_data_base, - scalar_t* output_data_base, + const scalar_t* grad_data_base, + const scalar_t* output_data_base, int64_t outer_size, int64_t dim_size) { using Vec = vec::Vectorized>; - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size); - if (grain_size < 1) - grain_size = 1; - + // See Note: grain_size value of 0 parallel_for( 0, outer_size, - grain_size, + 0, [&](int64_t begin, int64_t end) { for (const auto i : c10::irange(begin, end)) { scalar_t* grad_input_data = grad_input_data_base + i * dim_size; - scalar_t* grad_data = grad_data_base + i * dim_size; - scalar_t* output_data = output_data_base + i * dim_size; + const scalar_t* grad_data = grad_data_base + i * dim_size; + const scalar_t* output_data = output_data_base + i * dim_size; // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) scalar_t sum; if (log_softmax) { @@ -264,21 +264,22 @@ template inline typename std::enable_if_t>, void> _vec_softmax_backward( scalar_t* grad_input_data_base, - scalar_t* grad_output_data_base, - scalar_t* output_data_base, + const scalar_t* grad_output_data_base, + const scalar_t* output_data_base, int64_t outer_size, int64_t inner_size, int64_t dim_size) { using Vec = vec::Vectorized; int64_t outer_stride = dim_size * inner_size; int64_t BLOCK_SIZE = 128 * 1024; - int64_t CHUNK_SIZE = std::max( + int64_t MAX_CHUNK_SIZE = std::max( BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size()); - CHUNK_SIZE = CHUNK_SIZE / Vec::size() * Vec::size(); + MAX_CHUNK_SIZE = MAX_CHUNK_SIZE / Vec::size() * Vec::size(); + int64_t CHUNK_SIZE = std::min(MAX_CHUNK_SIZE, inner_size); int64_t num_chunks = divup(inner_size, CHUNK_SIZE); - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE); + // See Note: grain_size value of 0 parallel_for( - 0, outer_size * num_chunks, grain_size, [&](int64_t begin, int64_t end) { + 0, outer_size * num_chunks, 0, [&](int64_t begin, int64_t end) { // thread local temp buffer that holds vertical sum result auto buffer = std::make_unique(CHUNK_SIZE); scalar_t* tmp_sum_data = buffer.get(); @@ -303,8 +304,8 @@ _vec_softmax_backward( for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { int64_t offset = outer_idx * outer_stride + dim_idx * inner_size + inner_idx_begin; - scalar_t* grad_output_ptr = grad_output_data_base + offset; - scalar_t* output_ptr = output_data_base + offset; + const scalar_t* grad_output_ptr = grad_output_data_base + offset; + const scalar_t* output_ptr = output_data_base + offset; int64_t d1 = 
0; for (; d1 < size - (size % Vec::size()); d1 += Vec::size()) { @@ -323,8 +324,8 @@ _vec_softmax_backward( for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { int64_t offset = outer_idx * outer_stride + dim_idx * inner_size + inner_idx_begin; - scalar_t* grad_output_ptr = grad_output_data_base + offset; - scalar_t* output_ptr = output_data_base + offset; + const scalar_t* grad_output_ptr = grad_output_data_base + offset; + const scalar_t* output_ptr = output_data_base + offset; scalar_t* grad_input_ptr = grad_input_data_base + offset; int64_t d2 = 0; @@ -347,8 +348,8 @@ template inline typename std::enable_if_t>, void> _vec_softmax_backward( scalar_t* grad_input_data_base, - scalar_t* grad_output_data_base, - scalar_t* output_data_base, + const scalar_t* grad_output_data_base, + const scalar_t* output_data_base, int64_t outer_size, int64_t inner_size, int64_t dim_size) { @@ -356,13 +357,14 @@ _vec_softmax_backward( using fVec = vec::Vectorized; int64_t outer_stride = dim_size * inner_size; int64_t BLOCK_SIZE = 128 * 1024; - int64_t CHUNK_SIZE = std::max( + int64_t MAX_CHUNK_SIZE = std::max( BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size()); - CHUNK_SIZE = CHUNK_SIZE / Vec::size() * Vec::size(); + MAX_CHUNK_SIZE = MAX_CHUNK_SIZE / Vec::size() * Vec::size(); + int64_t CHUNK_SIZE = std::min(MAX_CHUNK_SIZE, inner_size); int64_t num_chunks = divup(inner_size, CHUNK_SIZE); - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE); + // See Note: grain_size value of 0 parallel_for( - 0, outer_size * num_chunks, grain_size, [&](int64_t begin, int64_t end) { + 0, outer_size * num_chunks, 0, [&](int64_t begin, int64_t end) { // thread local temp buffer that holds vertical sum result auto buffer = std::make_unique(CHUNK_SIZE); float* tmp_sum_data = buffer.get(); @@ -395,8 +397,8 @@ _vec_softmax_backward( for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { int64_t offset = outer_idx * outer_stride + dim_idx * inner_size + inner_idx_begin; - scalar_t* grad_output_ptr = grad_output_data_base + offset; - scalar_t* output_ptr = output_data_base + offset; + const scalar_t* grad_output_ptr = grad_output_data_base + offset; + const scalar_t* output_ptr = output_data_base + offset; float* grad_output_buffer_ptr = grad_output_buffer_data + dim_idx * CHUNK_SIZE; float* output_buffer_ptr = @@ -473,21 +475,22 @@ template inline typename std::enable_if_t>, void> _vec_log_softmax_backward( scalar_t* grad_input_data_base, - scalar_t* grad_output_data_base, - scalar_t* output_data_base, + const scalar_t* grad_output_data_base, + const scalar_t* output_data_base, int64_t outer_size, int64_t inner_size, int64_t dim_size) { using Vec = vec::Vectorized; int64_t outer_stride = dim_size * inner_size; int64_t BLOCK_SIZE = 128 * 1024; - int64_t CHUNK_SIZE = std::max( + int64_t MAX_CHUNK_SIZE = std::max( BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size()); - CHUNK_SIZE = CHUNK_SIZE / Vec::size() * Vec::size(); + MAX_CHUNK_SIZE = MAX_CHUNK_SIZE / Vec::size() * Vec::size(); + int64_t CHUNK_SIZE = std::min(MAX_CHUNK_SIZE, inner_size); int64_t num_chunks = divup(inner_size, CHUNK_SIZE); - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE); + // See Note: grain_size value of 0 parallel_for( - 0, outer_size * num_chunks, grain_size, [&](int64_t begin, int64_t end) { + 0, outer_size * num_chunks, 0, [&](int64_t begin, int64_t end) { // thread local temp buffer that holds vertical sum result auto buffer = std::make_unique(CHUNK_SIZE); scalar_t* tmp_sum_data = buffer.get(); 
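The recurring change in these backward kernels is twofold: the per-thread chunk is still sized from a 128 KiB working-set target and rounded down to a vector multiple, but it is now also clamped to inner_size, and the parallel_for grain size is set to 0 so work is spread evenly across threads (see the "grain_size value of 0" note above). The chunk arithmetic, distilled into a standalone sketch with illustrative parameters (float, 8-lane vectors) rather than the actual template code:

#include <algorithm>
#include <cstdint>

// Sketch of the chunk-size selection used by the softmax backward kernels,
// with illustrative element size and vector width.
int64_t pick_chunk_size(int64_t dim_size, int64_t inner_size,
                        int64_t elem_size = sizeof(float), int64_t vec_lanes = 8) {
  const int64_t BLOCK_SIZE = 128 * 1024;                        // target working set per chunk, in bytes
  int64_t max_chunk = std::max(BLOCK_SIZE / dim_size / elem_size, vec_lanes);
  max_chunk = max_chunk / vec_lanes * vec_lanes;                // round down to a vector multiple
  return std::min(max_chunk, inner_size);                       // never exceed the actual inner extent
}

With dim_size = 1024 floats, for example, max_chunk comes out to 32; a tensor with inner_size = 8 now gets a chunk of 8 instead of 32, so the thread-local temporary buffers match the work actually done.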
@@ -510,7 +513,7 @@ _vec_log_softmax_backward( // compute sum of grad_output for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { - scalar_t* grad_output_ptr = grad_output_data_base + + const scalar_t* grad_output_ptr = grad_output_data_base + outer_idx * outer_stride + dim_idx * inner_size + inner_idx_begin; @@ -530,8 +533,8 @@ _vec_log_softmax_backward( for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { int64_t offset = outer_idx * outer_stride + dim_idx * inner_size + inner_idx_begin; - scalar_t* grad_output_ptr = grad_output_data_base + offset; - scalar_t* output_ptr = output_data_base + offset; + const scalar_t* grad_output_ptr = grad_output_data_base + offset; + const scalar_t* output_ptr = output_data_base + offset; scalar_t* grad_input_ptr = grad_input_data_base + offset; int64_t d2 = 0; @@ -555,8 +558,8 @@ template inline typename std::enable_if_t>, void> _vec_log_softmax_backward( scalar_t* grad_input_data_base, - scalar_t* grad_output_data_base, - scalar_t* output_data_base, + const scalar_t* grad_output_data_base, + const scalar_t* output_data_base, int64_t outer_size, int64_t inner_size, int64_t dim_size) { @@ -564,13 +567,14 @@ _vec_log_softmax_backward( using fVec = vec::Vectorized; int64_t outer_stride = dim_size * inner_size; int64_t BLOCK_SIZE = 128 * 1024; - int64_t CHUNK_SIZE = std::max( + int64_t MAX_CHUNK_SIZE = std::max( BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size()); - CHUNK_SIZE = CHUNK_SIZE / Vec::size() * Vec::size(); + MAX_CHUNK_SIZE = MAX_CHUNK_SIZE / Vec::size() * Vec::size(); + int64_t CHUNK_SIZE = std::min(MAX_CHUNK_SIZE, inner_size); int64_t num_chunks = divup(inner_size, CHUNK_SIZE); - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE); + // See Note: grain_size value of 0 parallel_for( - 0, outer_size * num_chunks, grain_size, [&](int64_t begin, int64_t end) { + 0, outer_size * num_chunks, 0, [&](int64_t begin, int64_t end) { // thread local temp buffer that holds vertical sum result auto buffer = std::make_unique(CHUNK_SIZE); float* tmp_sum_data = buffer.get(); @@ -598,7 +602,7 @@ _vec_log_softmax_backward( // compute sum of grad_output for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { - scalar_t* grad_output_ptr = grad_output_data_base + + const scalar_t* grad_output_ptr = grad_output_data_base + outer_idx * outer_stride + dim_idx * inner_size + inner_idx_begin; float* grad_output_buffer_ptr = @@ -632,7 +636,7 @@ _vec_log_softmax_backward( for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { int64_t offset = outer_idx * outer_stride + dim_idx * inner_size + inner_idx_begin; - scalar_t* output_ptr = output_data_base + offset; + const scalar_t* output_ptr = output_data_base + offset; scalar_t* grad_input_ptr = grad_input_data_base + offset; float* grad_output_buffer_ptr = grad_output_buffer_data + dim_idx * CHUNK_SIZE; @@ -671,7 +675,7 @@ struct vec_host_softmax_lastdim { int64_t dim_size = input.size(input.ndimension() - 1); for (int64_t i = 0; i < input.ndimension() - 1; ++i) outer_size *= input.size(i); - scalar_t* input_data_base = input.data_ptr(); + const scalar_t* input_data_base = input.const_data_ptr(); scalar_t* output_data_base = output.data_ptr(); if (LogSoftMax) { _vec_log_softmax_lastdim( @@ -686,7 +690,7 @@ struct vec_host_softmax_lastdim { template inline typename std::enable_if_t>, void> _vec_softmax( - scalar_t* input_data_base, + const scalar_t* input_data_base, scalar_t* output_data_base, int64_t outer_size, int64_t inner_size, @@ -695,10 +699,10 @@ _vec_softmax( using Vec16 = 
vec::Vectorized; int64_t dim_stride = inner_size; int64_t outer_stride = dim_size * dim_stride; - int64_t grain_size = internal::GRAIN_SIZE / dim_size; int vectorized_step = Vec16().size(); // Currently, we only support BFloat16/Half in this special implementation + // See Note: grain_size value of 0 parallel_for( - 0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) { + 0, outer_size * inner_size, 0, [&](int64_t begin, int64_t end) { int64_t idx = begin; std::unique_ptr temp_vec_input(new float[dim_size*vectorized_step]()); std::unique_ptr temp_vec_output(new float[dim_size*vectorized_step]()); @@ -709,7 +713,7 @@ _vec_softmax( int64_t inner_idx = idx % inner_size; if (((inner_idx + vectorized_step) <= inner_size) && ((idx + vectorized_step) <= end)) { // Vectorization - scalar_t* input_data = + const scalar_t* input_data = input_data_base + outer_idx * outer_stride + inner_idx; scalar_t* output_data = output_data_base + outer_idx * outer_stride + inner_idx; @@ -756,13 +760,13 @@ _vec_softmax( // Tail case(Scalar): it is exactly same logic as host_softmax // inside aten/src/ATen/native/SoftMax.cpp. There are 2 kind of // cases which will fall through this part: - // Case 1: For the idx at the end of total chunk for each thread, there are not enough numbers for parallization. - // Case 2: For the idx at the end of each inner_size inside thread, there are not enough numbers for parallization. + // Case 1: For the idx at the end of total chunk for each thread, there are not enough numbers for parallelization. + // Case 2: For the idx at the end of each inner_size inside thread, there are not enough numbers for parallelization. int64_t tail_number = ((idx+vectorized_step) > end) ? /*Case1*/ (end - idx) : /*Case2*/ (inner_size - inner_idx); for (const auto i : c10::irange(tail_number)) { outer_idx = (idx + i) / inner_size; inner_idx = (idx + i) % inner_size; - scalar_t* input_data = + const scalar_t* input_data = input_data_base + outer_idx * outer_stride + inner_idx; scalar_t* output_data = output_data_base + outer_idx * outer_stride + inner_idx; @@ -794,7 +798,7 @@ _vec_softmax( template inline typename std::enable_if_t>, void> _vec_softmax( - scalar_t* input_data_base, + const scalar_t* input_data_base, scalar_t* output_data_base, int64_t outer_size, int64_t inner_size, @@ -802,17 +806,17 @@ _vec_softmax( using Vec = vec::Vectorized; int64_t dim_stride = inner_size; int64_t outer_stride = dim_size * dim_stride; - int64_t grain_size = internal::GRAIN_SIZE / dim_size; int vectorized_step = Vec().size(); + // See Note: grain_size value of 0 parallel_for( - 0, outer_size * inner_size, grain_size, [&](int64_t begin, int64_t end) { + 0, outer_size * inner_size, 0, [&](int64_t begin, int64_t end) { int64_t idx = begin; while (idx < end) { int64_t outer_idx = idx / inner_size; int64_t inner_idx = idx % inner_size; if (((inner_idx + vectorized_step) <= inner_size) && ((idx + vectorized_step) <= end)) { // Vectorization - scalar_t* input_data = + const scalar_t* input_data = input_data_base + outer_idx * outer_stride + inner_idx; scalar_t* output_data = output_data_base + outer_idx * outer_stride + inner_idx; @@ -841,13 +845,13 @@ _vec_softmax( // Tail case(Scalar): it is exactly same logic as host_softmax // inside aten/src/ATen/native/SoftMax.cpp. There are 2 kind of // cases which will fall through this part: - // Case 1: For the idx at the end of total chunk for each thread, there are not enough numbers for parallization. 
- // Case 2: For the idx at the end of each inner_size inside thread, there are not enough numbers for parallization. + // Case 1: For the idx at the end of total chunk for each thread, there are not enough numbers for parallelization. + // Case 2: For the idx at the end of each inner_size inside thread, there are not enough numbers for parallelization. int64_t tail_number = ((idx+vectorized_step) > end) ? /*Case1*/ (end - idx) : /*Case2*/ (inner_size - inner_idx); for (const auto i : c10::irange(tail_number)) { outer_idx = (idx + i) / inner_size; inner_idx = (idx + i) % inner_size; - scalar_t* input_data = + const scalar_t* input_data = input_data_base + outer_idx * outer_stride + inner_idx; scalar_t* output_data = output_data_base + outer_idx * outer_stride + inner_idx; @@ -878,7 +882,7 @@ _vec_softmax( // NB: fast kernel for log_softmax when dim != -1 // input shape is normalized to {outer_size, dim_size, inner_size} // -// The algorithm requires to load input tensor 3 times, to increase parallelsim +// The algorithm requires to load input tensor 3 times, to increase parallelism // and cache hit rate, inner_size is blocked as: // inner_size: {CHUNK_SIZE, CHUNK_SIZE, ..., Remainder} // @@ -888,19 +892,20 @@ _vec_softmax( template inline typename std::enable_if_t>, void> _vec_logsoftmax( - scalar_t* input_data_base, + const scalar_t* input_data_base, scalar_t* output_data_base, int64_t outer_size, int64_t inner_size, int64_t dim_size) { using Vec = vec::Vectorized; int64_t BLOCK_SIZE = 128 * 1024; - int64_t CHUNK_SIZE = std::max(BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size()); - CHUNK_SIZE = CHUNK_SIZE / Vec::size() * Vec::size(); + int64_t MAX_CHUNK_SIZE = std::max(BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size()); + MAX_CHUNK_SIZE = MAX_CHUNK_SIZE / Vec::size() * Vec::size(); + int64_t CHUNK_SIZE = std::min(MAX_CHUNK_SIZE, inner_size); int64_t num_chunks = divup(inner_size, CHUNK_SIZE); - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE); - at::parallel_for(0, outer_size * num_chunks, grain_size, [&](int64_t begin, int64_t end) { + // See Note: grain_size value of 0 + at::parallel_for(0, outer_size * num_chunks, 0, [&](int64_t begin, int64_t end) { // thread local temp buffer which holds vertical reduction result: max and sum. 
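An aside on the CHUNK_SIZE change visible in this hunk: the chunk width is now clamped to inner_size, so a small inner dimension no longer gets a chunk sized purely from the 128 KiB block target. Below is a minimal standalone sketch of the sizing math and of how a flat work index could decompose into (outer_idx, chunk); divup is assumed to be the usual ceiling division, vec_size stands in for Vec::size(), and the decomposition is my reading of the elided loop body, not a copy of it.

```cpp
// Sketch (not the kernel itself) of how the patched kernels derive CHUNK_SIZE.
#include <algorithm>
#include <cstdint>
#include <cstdio>

constexpr int64_t divup(int64_t x, int64_t y) { return (x + y - 1) / y; }

int main() {
  using scalar_t = float;
  const int64_t inner_size = 1000, dim_size = 64, outer_size = 8;
  const int64_t vec_size = 16;            // stand-in for vec::Vectorized<scalar_t>::size()
  const int64_t BLOCK_SIZE = 128 * 1024;  // target per-chunk working-set size in bytes

  // Largest chunk whose dim_size x CHUNK_SIZE slab fits in roughly BLOCK_SIZE bytes,
  // rounded down to a multiple of the vector width but never below it.
  int64_t MAX_CHUNK_SIZE =
      std::max<int64_t>(BLOCK_SIZE / dim_size / sizeof(scalar_t), vec_size);
  MAX_CHUNK_SIZE = MAX_CHUNK_SIZE / vec_size * vec_size;
  // New in this patch: never make a chunk wider than inner_size itself.
  const int64_t CHUNK_SIZE = std::min(MAX_CHUNK_SIZE, inner_size);
  const int64_t num_chunks = divup(inner_size, CHUNK_SIZE);

  // Assumed decomposition of the flat work index over outer_size * num_chunks items.
  for (int64_t i = 0; i < outer_size * num_chunks; ++i) {
    int64_t outer_idx = i / num_chunks;
    int64_t inner_idx_begin = (i % num_chunks) * CHUNK_SIZE;
    int64_t size = std::min(CHUNK_SIZE, inner_size - inner_idx_begin);
    (void)outer_idx; (void)size;
  }
  std::printf("CHUNK_SIZE=%lld num_chunks=%lld\n",
              (long long)CHUNK_SIZE, (long long)num_chunks);
  return 0;
}
```

The grain_size argument is now passed as 0, deferring thread splitting to at::parallel_for per the note the comment references, instead of the old GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE) formula.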
auto buffer = std::make_unique(CHUNK_SIZE * 2); scalar_t* input_max_data = buffer.get(); @@ -927,7 +932,7 @@ _vec_logsoftmax( // compute max for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { - scalar_t* input_ptr = input_data_base + outer_idx * dim_size * inner_size + const scalar_t* input_ptr = input_data_base + outer_idx * dim_size * inner_size + dim_idx * inner_size + inner_idx_begin; int64_t d1 = 0; @@ -946,7 +951,7 @@ _vec_logsoftmax( // compute sum of (x - max).exp() for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { - scalar_t* input_ptr = input_data_base + outer_idx * dim_size * inner_size + const scalar_t* input_ptr = input_data_base + outer_idx * dim_size * inner_size + dim_idx * inner_size + inner_idx_begin; int64_t d2 = 0; @@ -970,7 +975,7 @@ _vec_logsoftmax( // compute x - max - sum for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { int64_t offset = outer_idx * dim_size * inner_size + dim_idx * inner_size + inner_idx_begin; - scalar_t* input_ptr = input_data_base + offset; + const scalar_t* input_ptr = input_data_base + offset; scalar_t* output_ptr = output_data_base + offset; int64_t d3 = 0; @@ -992,7 +997,7 @@ _vec_logsoftmax( template inline typename std::enable_if_t>, void> _vec_logsoftmax( - scalar_t* input_data_base, + const scalar_t* input_data_base, scalar_t* output_data_base, int64_t outer_size, int64_t inner_size, @@ -1000,12 +1005,13 @@ _vec_logsoftmax( using Vec = vec::Vectorized; using fVec = vec::Vectorized; int64_t BLOCK_SIZE = 128 * 1024; - int64_t CHUNK_SIZE = std::max(BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size()); - CHUNK_SIZE = CHUNK_SIZE / Vec::size() * Vec::size(); + int64_t MAX_CHUNK_SIZE = std::max(BLOCK_SIZE / dim_size / sizeof(scalar_t), Vec::size()); + MAX_CHUNK_SIZE = MAX_CHUNK_SIZE / Vec::size() * Vec::size(); + int64_t CHUNK_SIZE = std::min(MAX_CHUNK_SIZE, inner_size); int64_t num_chunks = divup(inner_size, CHUNK_SIZE); - int64_t grain_size = internal::GRAIN_SIZE / (16 * dim_size * CHUNK_SIZE); - at::parallel_for(0, outer_size * num_chunks, grain_size, [&](int64_t begin, int64_t end) { + // See Note: grain_size value of 0 + at::parallel_for(0, outer_size * num_chunks, 0, [&](int64_t begin, int64_t end) { auto buffer = std::make_unique(CHUNK_SIZE * 2); float* input_max_data = buffer.get(); float* tmp_sum_data = buffer.get() + CHUNK_SIZE; @@ -1037,7 +1043,7 @@ _vec_logsoftmax( // compute max for (int64_t dim_idx = 0; dim_idx < dim_size; dim_idx++) { - scalar_t* input_ptr = input_data_base + outer_idx * dim_size * inner_size + const scalar_t* input_ptr = input_data_base + outer_idx * dim_size * inner_size + dim_idx * inner_size + inner_idx_begin; float* input_buffer_ptr = input_buffer_data + dim_idx * CHUNK_SIZE; @@ -1127,7 +1133,7 @@ struct vec_softmax { for (const auto i : c10::irange(dim))outer_size *= input.size(i); for (int64_t i = dim + 1; i < input.dim(); ++i) inner_size *= input.size(i); - scalar_t* input_data_base = input.data_ptr(); + const scalar_t* input_data_base = input.const_data_ptr(); scalar_t* output_data_base = output.data_ptr(); if (LogSoftMax) { _vec_logsoftmax( @@ -1148,8 +1154,8 @@ struct vec_host_softmax_backward_lastdim { for (int64_t i = 0; i < grad.ndimension() - 1; ++i) outer_size *= grad.size(i); scalar_t* grad_input_data_base = grad_input.mutable_data_ptr(); - scalar_t* grad_data_base = grad.data_ptr(); - scalar_t* output_data_base = output.data_ptr(); + const scalar_t* grad_data_base = grad.const_data_ptr(); + const scalar_t* output_data_base = output.const_data_ptr(); 
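The pointer changes just above are part of a const-correctness sweep: tensors that are only read now go through const_data_ptr<T>() (with add_const_input() on the TensorIterator side elsewhere in this patch), while the written tensor keeps mutable_data_ptr<T>(). A minimal illustrative helper showing the pattern; axpy_like is a made-up example, not a function from this patch, and it assumes contiguous float tensors of equal shape.

```cpp
#include <ATen/ATen.h>

// Read-only input accessed through const_data_ptr<T>(); destination through
// mutable_data_ptr<T>(). Purely illustrative of the access pattern.
void axpy_like(at::Tensor& out, const at::Tensor& in, float alpha) {
  TORCH_CHECK(in.is_contiguous() && out.is_contiguous());
  TORCH_CHECK(in.scalar_type() == at::kFloat && out.scalar_type() == at::kFloat);
  TORCH_CHECK(out.sizes().equals(in.sizes()));
  const float* src = in.const_data_ptr<float>();
  float* dst = out.mutable_data_ptr<float>();
  for (int64_t i = 0; i < in.numel(); ++i) {
    dst[i] += alpha * src[i];
  }
}
```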
_vec_host_softmax_backward_lastdim( grad_input_data_base, grad_data_base, @@ -1176,8 +1182,8 @@ struct vec_host_softmax_backward { inner_size *= grad.size(i); } scalar_t* grad_input_data_base = grad_input.mutable_data_ptr(); - scalar_t* grad_output_data_base = grad.data_ptr(); - scalar_t* output_data_base = output.data_ptr(); + const scalar_t* grad_output_data_base = grad.const_data_ptr(); + const scalar_t* output_data_base = output.const_data_ptr(); if (LogSoftMax) { _vec_log_softmax_backward( grad_input_data_base, diff --git a/aten/src/ATen/native/cpu/SortingKernel.cpp b/aten/src/ATen/native/cpu/SortingKernel.cpp index 89756906580a8..22ba0152153d3 100644 --- a/aten/src/ATen/native/cpu/SortingKernel.cpp +++ b/aten/src/ATen/native/cpu/SortingKernel.cpp @@ -5,6 +5,7 @@ #include #include #include +#include #include #include #include @@ -42,9 +43,8 @@ void _dim_apply( auto indices_dim_stride = indices.stride(dim); auto dim_size = values.size(dim); - AT_DISPATCH_ALL_TYPES_AND3( - ScalarType::Bool, ScalarType::Half, ScalarType::BFloat16, iter.dtype(), - "sorting_kernel_method_name", [&] { + AT_DISPATCH_V2( + iter.dtype(), "sorting_kernel_method_name", AT_WRAP([&] { auto loop = [&](char** data, const int64_t* strides, int64_t n) { auto* values_data_bytes = data[0]; auto* indices_data_bytes = data[1]; @@ -69,7 +69,7 @@ void _dim_apply( int64_t grain_size = internal::GRAIN_SIZE / std::max(int64_t{1}, dim_size); iter.for_each(loop, /*grain_size=*/grain_size); - } + }), kBool, kHalf, kBFloat16, AT_EXPAND(AT_ALL_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES) ); } @@ -216,7 +216,7 @@ static void topk_kernel( .declare_static_shape(sizes, /*squash_dims=*/dim) .add_output(values) .add_output(indices) - .add_input(self) + .add_const_input(self) .build(); auto mode_values_stride = values.strides()[dim]; diff --git a/aten/src/ATen/native/cpu/SparseFactories.cpp b/aten/src/ATen/native/cpu/SparseFactories.cpp index 8f938e545f27a..2c0b54b8dd7af 100644 --- a/aten/src/ATen/native/cpu/SparseFactories.cpp +++ b/aten/src/ATen/native/cpu/SparseFactories.cpp @@ -29,7 +29,7 @@ void _spdiags_kernel_cpu( "spdiags_cpu", [&] { auto* const values_write_ptr = values.data_ptr(); - const auto* const diagonals_ptr = diagonals.data_ptr(); + const auto* const diagonals_ptr = diagonals.const_data_ptr(); cpu_kernel( iter, diff --git a/aten/src/ATen/native/cpu/SpmmReduceKernel.cpp b/aten/src/ATen/native/cpu/SpmmReduceKernel.cpp index d9aa9a35f1b0d..36f36746dbd89 100644 --- a/aten/src/ATen/native/cpu/SpmmReduceKernel.cpp +++ b/aten/src/ATen/native/cpu/SpmmReduceKernel.cpp @@ -24,7 +24,7 @@ namespace at { namespace native { namespace { template -inline void _update(at::opmath_type* out_ptr, int64_t e, int64_t c, const scalar_t val, scalar_t* other_data, int64_t K) { +inline void _update(at::opmath_type* out_ptr, int64_t e, int64_t c, const scalar_t val, const scalar_t* other_data, int64_t K) { using opmath_t = at::opmath_type; using Vec = vec::Vectorized; using aVec = VecType; @@ -33,7 +33,7 @@ inline void _update(at::opmath_type* out_ptr, int64_t e, int64_t c, co int64_t k = 0; aVec val_vec = aVec((opmath_t)val); - scalar_t* other_ptr = other_data + c * K; + const scalar_t* other_ptr = other_data + c * K; for (; k < K - (K % kVLEN); k += kVLEN) { aVec out_vec0 = aVec::loadu(out_ptr + k); @@ -78,12 +78,12 @@ void spmm_reduce_kernel_impl( auto other = other_.contiguous(); - // access `crow_indices`, `col_indices` and `values` via TessorAccessor + // access `crow_indices`, `col_indices` and `values` via TensorAccessor scalar_t* 
out_data = out.data_ptr(); - auto csr_data = crow_indices.accessor(); - auto col_data = col_indices.accessor(); - auto val_data = values.accessor(); - scalar_t* other_data = other.data_ptr(); + auto csr_data = crow_indices.accessor(); + auto col_data = col_indices.accessor(); + auto val_data = values.accessor(); + const scalar_t* other_data = other.const_data_ptr(); int64_t M = crow_indices.numel() - 1; int64_t K = other.size(-1); @@ -178,10 +178,10 @@ void spmm_reduce_arg_kernel_impl( scalar_t* out_data = out.data_ptr(); index_t* arg_out_data = arg_out.data_ptr(); - auto csr_data = crow_indices.accessor(); - auto col_data = col_indices.accessor(); - auto val_data = values.accessor(); - scalar_t* other_data = other.data_ptr(); + auto csr_data = crow_indices.accessor(); + auto col_data = col_indices.accessor(); + auto val_data = values.accessor(); + const scalar_t* other_data = other.const_data_ptr(); int64_t M = crow_indices.numel() - 1; int64_t K = other.size(-1); @@ -222,7 +222,7 @@ void spmm_reduce_arg_kernel_impl( c = col_data[e]; opmath_t val = opmath_t(val_data[e]); - scalar_t* other_ptr = other_data + c * K; + const scalar_t* other_ptr = other_data + c * K; for (const auto k : c10::irange(K)) { update_with_index( &buffer_ptr[k], opmath_t(val * other_ptr[k]), &arg_out_ptr[k], index_t(e)); @@ -257,11 +257,11 @@ void spmm_reduce_backward_input_kernel_impl( auto values = grad_self.values(); auto grad_values_data = values.accessor(); - scalar_t* grad_out_data = grad_out.data_ptr(); - auto crow_data = crow_indices.accessor(); - auto col_data = col_indices.accessor(); - scalar_t* other_data = other.data_ptr(); - auto row_data = row_indices.accessor(); + const scalar_t* grad_out_data = grad_out.const_data_ptr(); + auto crow_data = crow_indices.accessor(); + auto col_data = col_indices.accessor(); + const scalar_t* other_data = other.const_data_ptr(); + auto row_data = row_indices.accessor(); int64_t K = grad_out.size(1); @@ -307,9 +307,9 @@ void spmm_reduce_backward_input_arg_kernel_impl( auto grad_values = grad_self.values(); auto grad_values_data = grad_values.accessor(); - scalar_t* grad_out_data = grad_out.data_ptr(); - auto col_data = col_indices.accessor(); - scalar_t* other_data = other.data_ptr(); + const scalar_t* grad_out_data = grad_out.const_data_ptr(); + auto col_data = col_indices.accessor(); + const scalar_t* other_data = other.const_data_ptr(); index_t* arg_out_data = arg_out.data_ptr(); int64_t M = grad_out.size(0); @@ -319,7 +319,7 @@ void spmm_reduce_backward_input_arg_kernel_impl( at::parallel_for(0, M, 1, [&](int64_t begin, int64_t end) { for (const auto m : c10::irange(begin, end)) { - scalar_t* grad_out_ptr = grad_out_data + m * K; + const scalar_t* grad_out_ptr = grad_out_data + m * K; scalar_t* grad_ptr = grad_data + m * K; index_t* arg_out_ptr = arg_out_data + m * K; @@ -389,10 +389,10 @@ void spmm_reduce_backward_other_arg_kernel_impl( auto arg_out = arg_out_.contiguous(); scalar_t* grad_other_data = grad_other.data_ptr(); - scalar_t* grad_out_data = grad_out.data_ptr(); - auto col_data = col_indices.accessor(); - auto values_data = values.accessor(); - index_t* arg_out_data = arg_out.data_ptr(); + const scalar_t* grad_out_data = grad_out.const_data_ptr(); + auto col_data = col_indices.accessor(); + auto values_data = values.accessor(); + const index_t* arg_out_data = arg_out.const_data_ptr(); int64_t M = grad_out.size(0); int64_t K = grad_out.size(1); @@ -401,9 +401,9 @@ void spmm_reduce_backward_other_arg_kernel_impl( at::parallel_for(0, M, 1, [&](int64_t begin, 
int64_t end) { for (const auto m : c10::irange(begin, end)) { - scalar_t* grad_out_ptr = grad_out_data + m * K; + const scalar_t* grad_out_ptr = grad_out_data + m * K; scalar_t* grad_ptr = grad_data + m * K; - index_t* arg_out_ptr = arg_out_data + m * K; + const index_t* arg_out_ptr = arg_out_data + m * K; for (const auto k : c10::irange(K)) { if (arg_out_ptr[k] == index_t(nnz)) { diff --git a/aten/src/ATen/native/cpu/SumKernel.cpp b/aten/src/ATen/native/cpu/SumKernel.cpp index 3f0fde5d4b6e2..7865a6a82d272 100644 --- a/aten/src/ATen/native/cpu/SumKernel.cpp +++ b/aten/src/ATen/native/cpu/SumKernel.cpp @@ -6,7 +6,7 @@ #include #include #include - +#include #include namespace at::native { @@ -82,8 +82,13 @@ struct CastLoadPolicy: }; // For inner sum, load full vec_t then sum partials down to vacc_t size +template +struct InnerSumCastLoadPolicy; + template -struct InnerSumCastLoadPolicy { +struct InnerSumCastLoadPolicy >) && + !std::is_same_v>> { using scalar_t = vechold_type; using acc_t = vechold_type; @@ -100,30 +105,35 @@ struct InnerSumCastLoadPolicy { }; template -struct InnerSumCastLoadPolicy: +struct InnerSumCastLoadPolicy: LoadPolicy { }; -template <> -struct InnerSumCastLoadPolicy, Vectorized> { - using vec_t = Vectorized; - using vacc_t = Vectorized; +template +struct InnerSumCastLoadPolicy >>> { + using scalar_t = vechold_type; static constexpr int64_t memsize() { return LoadPolicy::memsize(); } static vacc_t load(const char * C10_RESTRICT data, int64_t stride, int64_t index) { - auto ptr = reinterpret_cast(data + stride * index); + auto ptr = reinterpret_cast(data + stride * index); vacc_t first, second; - vec::load_fp32_from_bf16(ptr, first, second); + vec::load_to_float(ptr, first, second); return first + second; } }; // For outer sum, load a partial vec_t of size vacc_t then cast to vacc_t +template +struct OuterSumCastLoadPolicy; + template -struct OuterSumCastLoadPolicy { +struct OuterSumCastLoadPolicy >) && + !std::is_same_v>> { + using scalar_t = vechold_type; using acc_t = vechold_type; @@ -146,25 +156,24 @@ struct OuterSumCastLoadPolicy { } }; -template <> -struct OuterSumCastLoadPolicy, Vectorized> { - using vec_t = Vectorized; - using vacc_t = Vectorized; +template +struct OuterSumCastLoadPolicy >>> { + using scalar_t = vechold_type; static constexpr int64_t memsize() { - return sizeof(c10::BFloat16) * vacc_t::size(); + return sizeof(scalar_t) * vacc_t::size(); } static vacc_t load(const char * C10_RESTRICT data, int64_t stride, int64_t index) { - auto ptr = reinterpret_cast(data + stride * index); + auto ptr = reinterpret_cast(data + stride * index); vacc_t values; - vec::load_fp32_from_bf16(ptr, values); + vec::load_to_float(ptr, values); return values; } }; template -struct OuterSumCastLoadPolicy: +struct OuterSumCastLoadPolicy: LoadPolicy { }; @@ -210,8 +219,13 @@ struct NanSumCastLoadPolicy { } }; +template +struct InnerNanSumCastLoadPolicy; + template -struct InnerNanSumCastLoadPolicy { +struct InnerNanSumCastLoadPolicy >) && + !std::is_same_v>> { using scalar_t = vechold_type; using acc_t = vechold_type; @@ -228,23 +242,22 @@ struct InnerNanSumCastLoadPolicy { }; template -struct InnerNanSumCastLoadPolicy : +struct InnerNanSumCastLoadPolicy: NanSumLoadPolicy { }; -template <> -struct InnerNanSumCastLoadPolicy, Vectorized> { - using vec_t = Vectorized; - using vacc_t = Vectorized; +template +struct InnerNanSumCastLoadPolicy >>> { + using scalar_t = vechold_type; static constexpr int64_t memsize() { return LoadPolicy::memsize(); } static vacc_t load(const char * 
C10_RESTRICT data, int64_t stride, int64_t index) { - auto ptr = reinterpret_cast(data + stride * index); + auto ptr = reinterpret_cast(data + stride * index); vacc_t first, second; - vec::load_fp32_from_bf16(ptr, first, second); + vec::load_to_float(ptr, first, second); const vacc_t zero(0); return (vacc_t::blendv(first, zero, first.isnan()) + vacc_t::blendv(second, zero, second.isnan())); diff --git a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp index f014c34c7e2e0..984e60056af9a 100644 --- a/aten/src/ATen/native/cpu/TensorCompareKernel.cpp +++ b/aten/src/ATen/native/cpu/TensorCompareKernel.cpp @@ -59,7 +59,7 @@ static inline void compare_base_kernel_core( .declare_static_shape(self.sizes(), /*squash_dims=*/dim) .add_output(result1) .add_output(result2) - .add_input(self) + .add_const_input(self) .build(); iter.for_each(loop, /* grain_size */ 1); @@ -320,13 +320,13 @@ static void isin_default_kernel_cpu( auto iter = TensorIteratorConfig() .add_output(out) - .add_input(promoted_elements) + .add_const_input(promoted_elements) .check_all_same_dtype(false) .build(); // Dispatch based on promoted type. AT_DISPATCH_ALL_TYPES(iter.dtype(1), "isin_default_cpu", [&]() { cpu_kernel(iter, [&](scalar_t element_val) -> bool { - const auto* test_element_data = test_elements_flat.data_ptr(); + const auto* test_element_data = test_elements_flat.const_data_ptr(); for (const auto j : c10::irange(test_elements_flat.numel())) { if (element_val == *(test_element_data + test_elements_stride * j)) { return !invert; diff --git a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp index a966e4ac6dd18..461ceb2f36383 100644 --- a/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/UnaryOpsKernel.cpp @@ -18,7 +18,6 @@ #include #include -#include #include #include #include @@ -45,8 +44,7 @@ static void sigmoid_kernel(TensorIteratorBase& iter) { return static_cast(1) / (static_cast(1) + std::exp((-a0))); }, [=](Vectorized a) { - Vectorized a0, a1; - std::tie(a0, a1) = convert_to_float(a); + auto [a0, a1] = convert_to_float(a); a0 = (Vectorized(static_cast(1)) + a0.neg().exp()).reciprocal(); a1 = (Vectorized(static_cast(1)) + a1.neg().exp()).reciprocal(); return convert_from_float(a0, a1); @@ -145,6 +143,7 @@ static void logit_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar) { const scalar_t eps = eps_scalar.to(); if (at::hasMKL() && iter.is_contiguous()) { LogitMKLKernel(eps, &iter); + iter.cast_outputs(); } else if (eps < scalar_t(0)) { const Vectorized kOneVec(scalar_t(1)); cpu_kernel_vec( @@ -180,9 +179,9 @@ static void logit_kernel(TensorIteratorBase& iter, const Scalar& eps_scalar) { } #if !defined(C10_MOBILE) -#define _AT_DISPATCH_ABS_TYPES(TYPE, NAME, ...) \ - AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( \ - kHalf, kBFloat16, kFloat8_e5m2, kFloat8_e4m3fn, \ +#define _AT_DISPATCH_ABS_TYPES(TYPE, NAME, ...) \ + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND6( \ + kHalf, kBFloat16, kFloat8_e5m2, kFloat8_e4m3fn, kFloat8_e5m2fnuz, kFloat8_e4m3fnuz, \ TYPE, NAME, __VA_ARGS__) #else #define _AT_DISPATCH_ABS_TYPES(TYPE, NAME, ...) 
\ @@ -356,8 +355,9 @@ static void sinc_kernel(TensorIteratorBase& iter) { if (a == scalar_t(0)) { return scalar_t(1); } else { - scalar_t product = c10::pi * a; - return std::sin(product) / product; + using opmath_t = at::opmath_type; + opmath_t product = c10::pi * opmath_t{a}; + return static_cast(std::sin(product) / product); } }); }); @@ -523,8 +523,8 @@ static void kaiser_window_kernel(TensorIteratorBase& iter, int64_t window_length using opmath_t = at::opmath_type; const opmath_t alpha = static_cast((window_length - 1) / 2.0); const opmath_t beta_ = static_cast(beta); - cpu_kernel(iter, [=](scalar_t a){ - return calc_i0(beta_ * std::sqrt(1 - std::pow((static_cast(a) - alpha) / alpha, static_cast(2.0)))) / calc_i0(beta_); + cpu_kernel(iter, [=](scalar_t a) -> scalar_t { + return calc_i0(beta_ * std::sqrt(std::abs(1 - std::pow((static_cast(a) - alpha) / alpha, static_cast(2.0))))) / calc_i0(beta_); }); }); } diff --git a/aten/src/ATen/native/cpu/Unfold2d.cpp b/aten/src/ATen/native/cpu/Unfold2d.cpp index bb35ef23b8eaa..026cfa812f3c6 100644 --- a/aten/src/ATen/native/cpu/Unfold2d.cpp +++ b/aten/src/ATen/native/cpu/Unfold2d.cpp @@ -228,7 +228,7 @@ void unfolded2d_acc_kernel( template static void unfolded2d_copy( - scalar_t* input_data, + const scalar_t* input_data, scalar_t* finput_data, int64_t kH, int64_t kW, @@ -256,7 +256,7 @@ static void unfolded2d_copy( nip * ((size_t)kH * kW * output_height * output_width) + kh * ((size_t)kW * output_height * output_width) + kw * ((size_t)output_height * output_width); - scalar_t* src = + const scalar_t* src = input_data + nip * ((size_t)input_height * input_width); if (padW > 0 || padH > 0) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) @@ -335,7 +335,7 @@ static void unfolded2d_copy( template static void unfolded2d_copy_channels_last( - scalar_t* input_data, + const scalar_t* input_data, scalar_t* finput_data, int64_t kH, int64_t kW, @@ -355,7 +355,7 @@ static void unfolded2d_copy_channels_last( for (const auto k C10_UNUSED: c10::irange(start, end)) { scalar_t* dst = finput_data + y * output_width * kH * kW * n_input_plane + x * kH * kW * n_input_plane; - scalar_t* src = input_data; + const scalar_t* src = input_data; if (padW > 0 || padH > 0) { for (int64_t kh = 0; kh < kH; kh++) { @@ -393,7 +393,7 @@ static void unfolded2d_copy_channels_last( void unfolded2d_copy_kernel( ScalarType dtype, void *finput_data, - void *input_data, + const void *input_data, int64_t kH, int64_t kW, int64_t dH, @@ -415,7 +415,7 @@ void unfolded2d_copy_kernel( if (is_channels_last) { AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, dtype, "unfolded2d_copy_channels_last", [&] { unfolded2d_copy_channels_last( - static_cast(input_data), + static_cast(input_data), static_cast(finput_data), kH, kW, dH, dW, @@ -429,7 +429,7 @@ void unfolded2d_copy_kernel( } else { AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::BFloat16, at::ScalarType::Half, dtype, "unfolded2d_copy", [&] { unfolded2d_copy( - static_cast(input_data), + static_cast(input_data), static_cast(finput_data), kH, kW, dH, dW, diff --git a/aten/src/ATen/native/cpu/UpSampleKernel.cpp b/aten/src/ATen/native/cpu/UpSampleKernel.cpp index bee568881a95e..67fe50c1d2a62 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleKernel.cpp @@ -73,30 +73,30 @@ using scale_t = std::vector>; // - recursively compute interpolated output for each dimension // - we rely a lot on compiler's code optimization such that implemented operations // can be automatically 
factorized and vectorized using SSE and AVX2 -template +template struct Interpolate { - static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) { + static inline opmath_t eval(char* src, char** data, const int64_t* strides, int64_t i) { index_t ids = *(index_t*)&data[0][i * strides[0]]; - scalar_t wts = *(scalar_t*)&data[1][i * strides[1]]; - scalar_t t = Interpolate::eval(src + ids, &data[2 * interp_size], &strides[2 * interp_size], i); - scalar_t output = t * wts; + opmath_t wts = *(scalar_t*)&data[1][i * strides[1]]; + opmath_t t = Interpolate::eval(src + ids, &data[2 * interp_size], &strides[2 * interp_size], i); + opmath_t output = t * wts; for (const auto j : c10::irange(1, interp_size)) { ids = *(index_t*)&data[2 * j + 0][i * strides[2 * j + 0]]; wts = *(scalar_t*)&data[2 * j + 1][i * strides[2 * j + 1]]; - t = Interpolate::eval(src + ids, &data[2 * interp_size], &strides[2 * interp_size], i); + t = Interpolate::eval(src + ids, &data[2 * interp_size], &strides[2 * interp_size], i); output += t * wts; } return output; } }; -template -struct Interpolate<1, scalar_t, index_t, interp_size> { - static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) { +template +struct Interpolate<1, scalar_t, opmath_t, index_t, interp_size> { + static inline opmath_t eval(char* src, char** data, const int64_t* strides, int64_t i) { index_t ids = *(index_t*)&data[0][i * strides[0]]; - scalar_t wts = *(scalar_t*)&data[1][i * strides[1]]; - scalar_t t = *(scalar_t *)&src[ids]; - scalar_t output = t * wts; + opmath_t wts = *(scalar_t*)&data[1][i * strides[1]]; + opmath_t t = *(scalar_t *)&src[ids]; + opmath_t output = t * wts; for (const auto j : c10::irange(1, interp_size)) { ids = *(index_t*)&data[2 * j + 0][i * strides[2 * j + 0]]; wts = *(scalar_t*)&data[2 * j + 1][i * strides[2 * j + 1]]; @@ -107,17 +107,17 @@ struct Interpolate<1, scalar_t, index_t, interp_size> { } }; -template -struct Interpolate { - static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) { +template +struct Interpolate { + static inline opmath_t eval(char* src, char** data, const int64_t* strides, int64_t i) { index_t ids = *(index_t*)&data[0][i * strides[0]]; - return Interpolate::eval(src + ids, &data[2], &strides[2], i); + return Interpolate::eval(src + ids, &data[2], &strides[2], i); } }; -template -struct Interpolate<1, scalar_t, index_t, 1> { - static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) { +template +struct Interpolate<1, scalar_t, opmath_t, index_t, 1> { + static inline opmath_t eval(char* src, char** data, const int64_t* strides, int64_t i) { index_t ids = *(index_t*)&data[0][i * strides[0]]; return *(scalar_t *)&src[ids]; } @@ -128,37 +128,38 @@ struct Interpolate<1, scalar_t, index_t, 1> { // Once the issue is fixed we can keep generic implementation and remove: // struct Interpolate and // struct Interpolate<1, scalar_t, index_t, 2> -template -struct Interpolate { - static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) { +template +struct Interpolate { + static inline opmath_t eval(char* src, char** data, const int64_t* strides, int64_t i) { index_t i0 = *(index_t*)&data[0][i * strides[0]]; index_t i1 = *(index_t*)&data[2][i * strides[2]]; - scalar_t w0 = *(scalar_t *)&data[1][i * strides[1]]; - scalar_t w1 = *(scalar_t *)&data[3][i * strides[3]]; + opmath_t w0 = *(scalar_t *)&data[1][i * strides[1]]; + opmath_t w1 = *(scalar_t *)&data[3][i * strides[3]]; 
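The template change above splits the storage type from the accumulation type: indices and weights are still read as scalar_t, but every product and running sum is carried in opmath_t (float when scalar_t is BFloat16 or Half), and the result is cast back once at the end. A stripped-down sketch of that pattern, using plain arrays instead of the byte-offset/stride indexing of the real Interpolate structs; weighted_sum is a made-up name.

```cpp
#include <cstdint>
#include <cstdio>

// Values and weights are stored as scalar_t, accumulation happens in opmath_t.
template <typename scalar_t, typename opmath_t>
opmath_t weighted_sum(const scalar_t* src, const int64_t* ids,
                      const scalar_t* wts, int interp_size) {
  opmath_t output =
      static_cast<opmath_t>(src[ids[0]]) * static_cast<opmath_t>(wts[0]);
  for (int j = 1; j < interp_size; ++j) {
    output += static_cast<opmath_t>(src[ids[j]]) * static_cast<opmath_t>(wts[j]);
  }
  return output;  // the caller casts back to scalar_t once, after the reduction
}

int main() {
  const float src[4] = {0.f, 1.f, 2.f, 3.f};
  const int64_t ids[2] = {1, 2};
  const float wts[2] = {0.25f, 0.75f};
  // With scalar_t = float the opmath type is also float; for BFloat16/Half the
  // same shape of code accumulates in float and avoids repeated rounding.
  std::printf("%f\n", weighted_sum<float, float>(src, ids, wts, 2));
  return 0;
}
```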
- scalar_t t0 = Interpolate::eval(src + i0, &data[4], &strides[4], i); - scalar_t t1 = Interpolate::eval(src + i1, &data[4], &strides[4], i); + opmath_t t0 = Interpolate::eval(src + i0, &data[4], &strides[4], i); + opmath_t t1 = Interpolate::eval(src + i1, &data[4], &strides[4], i); return t0 * w0 + t1 * w1; } }; -template -struct Interpolate<1, scalar_t, index_t, 2> { - static inline scalar_t eval(char* src, char** data, const int64_t* strides, int64_t i) { +template +struct Interpolate<1, scalar_t, opmath_t, index_t, 2> { + static inline opmath_t eval(char* src, char** data, const int64_t* strides, int64_t i) { index_t i0 = *(index_t*)&data[0][i * strides[0]]; index_t i1 = *(index_t*)&data[2][i * strides[2]]; - scalar_t w0 = *(scalar_t *)&data[1][i * strides[1]]; - scalar_t w1 = *(scalar_t *)&data[3][i * strides[3]]; - scalar_t t0 = *(scalar_t *)&src[i0]; - scalar_t t1 = *(scalar_t *)&src[i1]; + opmath_t w0 = *(scalar_t *)&data[1][i * strides[1]]; + opmath_t w1 = *(scalar_t *)&data[3][i * strides[3]]; + opmath_t t0 = *(scalar_t *)&src[i0]; + opmath_t t1 = *(scalar_t *)&src[i1]; return t0 * w0 + t1 * w1; } }; template static inline scalar_t interpolate(char* src, char** data, const int64_t* strides, int64_t i) { - return Interpolate::eval(src, data, strides, i); + using opmath_t = at::opmath_type; + return Interpolate::eval(src, data, strides, i); } template @@ -472,7 +473,7 @@ void cpu_upsample_nearest_channels_last( auto input = input_.contiguous(channels_last_memory_format); auto output = output_.contiguous(channels_last_memory_format); - auto input_data = input.data_ptr(); + auto input_data = input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t num_batches = input_sizes[0]; @@ -488,7 +489,7 @@ void cpu_upsample_nearest_channels_last( TORCH_CHECK(channels > 0, "expected input and output channels greater than 0 but got ", channels); using Vec = vec::Vectorized; - auto copy = [](scalar_t* out, scalar_t* in, int64_t size) { + auto copy = [](scalar_t* out, const scalar_t* in, int64_t size) { int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { Vec out_vec = Vec::loadu(in + d); @@ -509,7 +510,7 @@ void cpu_upsample_nearest_channels_last( int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[0]); int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[1]); scalar_t* output_ptr = output_data + i * channels; - scalar_t* input_ptr = input_data + n * input_height * input_width * channels + + const scalar_t* input_ptr = input_data + n * input_height * input_width * channels + ih * input_width * channels + iw * channels; copy(output_ptr, input_ptr, channels); data_index_step(n, num_batches, oh, output_height, ow, output_width); @@ -528,7 +529,7 @@ void cpu_upsample_nearest_channels_last( int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[1]); int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[2]); scalar_t* output_ptr = output_data + i * channels; - scalar_t* input_ptr = input_data + n * input_depth * input_height * input_width * channels + + const scalar_t* input_ptr = input_data + n * input_depth * input_height * input_width * channels + id * input_height * input_width * channels + ih * input_width * channels + iw * channels; copy(output_ptr, input_ptr, channels); @@ -578,7 +579,7 @@ void cpu_upsample_linear_channels_last( auto input = input_.contiguous(channels_last_memory_format); auto output = output_.contiguous(channels_last_memory_format); - auto input_data = input.data_ptr(); + auto input_data = 
input.const_data_ptr(); auto output_data = output.data_ptr(); int64_t num_batches = input_sizes[0]; @@ -619,10 +620,10 @@ void cpu_upsample_linear_channels_last( scalar_t* out = output_data + n * output_slice_size + oh * output_width * channels + ow * channels; - scalar_t* i00 = input_indexr(n, ih0, iw0); - scalar_t* i01 = input_indexr(n, ih0, iw1); - scalar_t* i10 = input_indexr(n, ih1, iw0); - scalar_t* i11 = input_indexr(n, ih1, iw1); + const scalar_t* i00 = input_indexr(n, ih0, iw0); + const scalar_t* i01 = input_indexr(n, ih0, iw1); + const scalar_t* i10 = input_indexr(n, ih1, iw0); + const scalar_t* i11 = input_indexr(n, ih1, iw1); opmath_t w00 = h0lambda * w0lambda; opmath_t w01 = h0lambda * w1lambda; opmath_t w10 = h1lambda * w0lambda; @@ -673,14 +674,14 @@ void cpu_upsample_linear_channels_last( scalar_t* out = output_data + n * output_slice_size + od * output_height * output_width * channels + oh * output_width * channels + ow * channels; - scalar_t* i000 = input_indexr(n, id0, ih0, iw0); - scalar_t* i001 = input_indexr(n, id0, ih0, iw1); - scalar_t* i010 = input_indexr(n, id0, ih1, iw0); - scalar_t* i011 = input_indexr(n, id0, ih1, iw1); - scalar_t* i100 = input_indexr(n, id1, ih0, iw0); - scalar_t* i101 = input_indexr(n, id1, ih0, iw1); - scalar_t* i110 = input_indexr(n, id1, ih1, iw0); - scalar_t* i111 = input_indexr(n, id1, ih1, iw1); + const scalar_t* i000 = input_indexr(n, id0, ih0, iw0); + const scalar_t* i001 = input_indexr(n, id0, ih0, iw1); + const scalar_t* i010 = input_indexr(n, id0, ih1, iw0); + const scalar_t* i011 = input_indexr(n, id0, ih1, iw1); + const scalar_t* i100 = input_indexr(n, id1, ih0, iw0); + const scalar_t* i101 = input_indexr(n, id1, ih0, iw1); + const scalar_t* i110 = input_indexr(n, id1, ih1, iw0); + const scalar_t* i111 = input_indexr(n, id1, ih1, iw1); opmath_t w000 = d0lambda * h0lambda * w0lambda; opmath_t w001 = d0lambda * h0lambda * w1lambda; opmath_t w010 = d0lambda * h1lambda * w0lambda; @@ -741,30 +742,30 @@ struct HelperInterpBase { } } + // This is a helper function for _compute_index_ranges_weights method that computes + // source two int64 scalars index min and size and a list weights (of size max_interp_size) + // for interpolation with antialiasing=true mode. It returns the maximal weights value template - static inline scalar_t _compute_weights_aa( + static inline scalar_t _compute_indices_min_size_weights_aa( const int64_t i, const int64_t input_size, const scalar_t scale, const scalar_t support, scalar_t* wt_ptr, const int64_t max_interp_size, aa_filter_fn_t filter_fn, - int64_t& xmin, int64_t& xsize, bool antialias, double align_corners_delta + int64_t& xmin, int64_t& xsize ) { - // align_corners_delta is 0.5 for uint8 and align_corners=true and antialias=false - // is 0.0 otherwise - scalar_t center = scale * (i + 0.5 - align_corners_delta); + scalar_t center = scale * (i + 0.5); scalar_t total_w = 0.0; - scalar_t invscale = (scale >= 1.0 && antialias) ? 1.0 / scale : 1.0; + scalar_t invscale = (scale >= 1.0) ? 1.0 / scale : 1.0; xmin = std::max( - static_cast(center - support + 0.5 + align_corners_delta), static_cast(0)); + static_cast(center - support + 0.5), static_cast(0)); xsize = std::min( - static_cast(center + support + 0.5 + align_corners_delta), input_size) - xmin; - + static_cast(center + support + 0.5), input_size) - xmin; // There are rare cases when due to precision xsize can be larger than max_interp_size by one. 
// We have to clip the value xsize = std::clamp(xsize, static_cast(0), max_interp_size); int64_t j = 0; for (; j < xsize; j++) { - scalar_t w = filter_fn((j + xmin - center + 0.5 - align_corners_delta) * invscale); + scalar_t w = filter_fn((j + xmin - center + 0.5) * invscale); wt_ptr[j] = w; total_w += w; } @@ -783,10 +784,72 @@ struct HelperInterpBase { return wt_max; } - // Note [ Support for antialias=False as a subcase of antilias=True ] + // This is a helper function for _compute_index_ranges_weights method that computes + // source two int64 scalars index min and size and a list weights (of size max_interp_size) + // for interpolation with antialiasing=false mode. It returns the maximal weights value. + // This function is templated with scalar_t for type of scale and weights but is only used for + // bilinear/bicubic modes on uint8 input and antialiasing=false (in this case scalar_t is double). + // For float input types we are using upsample_generic_Nd_kernel_impl and compute_indices_weights methods + template + static inline scalar_t _compute_indices_min_size_weights( + const int64_t i, const int64_t input_size, const scalar_t scale, + scalar_t* wt_ptr, const int64_t max_interp_size, aa_filter_fn_t filter_fn, + bool align_corners, int64_t& index_min, int64_t& index_size + ) { + // Notes. We do not use opmath_t in this method as f16 and other smaller float types are not routed here. + // Typical usage of this method is with scalar_t = double when computing indices and weights for uint8 input + // The code below partly adapts indices and lambda computation from compute_indices_weights method and + // index_min/index_size from _compute_indices_min_size_weights_aa + + bool cubic = max_interp_size > 2; + const auto real_input_index = area_pixel_compute_source_index( + scale, i, align_corners, /*cubic=*/cubic); + + scalar_t lambda; + int64_t input_index; + guard_index_and_lambda(real_input_index, input_size, input_index, lambda); + + const auto support = static_cast(max_interp_size * 0.5); + const auto unbound_index_min = input_index - support + 1; + const auto unbound_index_max = input_index + support + 1; + index_min = std::max(unbound_index_min, static_cast(0)); + index_size = std::min(unbound_index_max, input_size) - index_min; + // There are rare cases when due to precision xsize can be larger than max_interp_size by one. + // We have to clip the value + index_size = std::clamp(index_size, static_cast(0), max_interp_size); + + // Below the weights are computed using filter_fn and accumulating values for indices being out of bounds + // For example, for bicubic mode for output index i = 0, we have input_index = -1, + // then we have unbound_index_min = -2 and unbound_index_max = 1 => unbounded input indices are [-2, -1, 0, 1] and + // valid input indices will be [0, 1] + // For unbounded input indices we compute four non-zero weights values [w0, w1, w2, w3] and as only two weights can + // be used with valid input indcies, we accumulate values in the following way: [w0 + w1 + w2, w3, 0.0, 0.0] + // This is equivalent to the float path which would compute indices as [0, 0, 0, 1] and weights as [w0, w1, w2, s3]. + // A similar accumulation should done for unbounded indices larger than input size. 
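A standalone restatement of the boundary handling the comment above describes, with the same index_min/index_size computation as the new helper but a toy filter in place of the bilinear/bicubic ones (fold_weights and toy_filter are made-up names). Running it reproduces the [w0 + w1 + w2, w3, 0, 0] folding from the bicubic example in the comment.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Taps falling outside [0, input_size) keep their weights, but those weights
// are folded onto the nearest valid slot, so only index_size weights are used.
std::vector<double> fold_weights(int64_t input_index, double lambda,
                                 int64_t input_size, int64_t max_interp_size,
                                 double (*filter_fn)(double),
                                 int64_t& index_min, int64_t& index_size) {
  const auto support = static_cast<int64_t>(max_interp_size * 0.5);
  const int64_t unbound_index_min = input_index - support + 1;
  const int64_t unbound_index_max = input_index + support + 1;
  index_min = std::max<int64_t>(unbound_index_min, 0);
  index_size = std::min(unbound_index_max, input_size) - index_min;
  index_size = std::clamp<int64_t>(index_size, 0, max_interp_size);

  std::vector<double> wt(max_interp_size, 0.0);
  int64_t w_index = 0;
  for (int64_t j = 0; j < max_interp_size; ++j) {
    double w = filter_fn(static_cast<double>(j + 1 - support) - lambda);
    if (unbound_index_min + j <= 0) {
      w_index = 0;                 // fold left out-of-range taps onto slot 0
    } else if (unbound_index_min + j >= input_size - 1) {
      w_index = index_size - 1;    // fold right out-of-range taps onto the last slot
    }
    wt[w_index] += w;
    ++w_index;
  }
  return wt;
}

int main() {
  // Bicubic-like setup from the comment: output index 0 maps to input_index = -1,
  // so the unbounded taps are [-2, -1, 0, 1] but only [0, 1] are valid.
  auto toy_filter = [](double x) { return std::max(0.0, 1.5 - std::abs(x) * 0.5); };
  int64_t index_min = 0, index_size = 0;
  auto wt = fold_weights(/*input_index=*/-1, /*lambda=*/0.0, /*input_size=*/8,
                         /*max_interp_size=*/4, +toy_filter, index_min, index_size);
  std::printf("index_min=%lld index_size=%lld weights=[%g, %g, %g, %g]\n",
              (long long)index_min, (long long)index_size, wt[0], wt[1], wt[2], wt[3]);
  return 0;
}
```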
+ auto w_index = 0; + scalar_t wt_max = 0.0; + for (const auto j : c10::irange(max_interp_size)) { + // initialize weights value as we will accumulate below + wt_ptr[j] = 0.0; + + scalar_t w = filter_fn(static_cast(j + 1 - support) - lambda); + if (unbound_index_min + j <= 0) { + w_index = 0; + } else if (unbound_index_min + j >= input_size - 1) { + w_index = index_size - 1; + } + wt_ptr[w_index] += w; + wt_max = std::max(wt_max, wt_ptr[w_index]); + w_index++; + } + + return wt_max; + } + + // Note [ Support for antialias=False as a subcase of antialias=True ] // This function was originally written with the hard assumption that - // antialias=True (hence the aa in the name). It was later extended to support - // antialias=False. The only difference between aa and no-aa is in how the + // antialias=True and it was later extended to support antialias=False. + // The only difference between aa and no-aa is in how the // weights and indices are computed (and their number). In aa their number is // variable but with no-aa, they're fixed to interp_size. The same "filters" // can be used otherwise. HOWEVER, support for antialias=False here may not be @@ -794,10 +857,10 @@ struct HelperInterpBase { // indices, but this can be optimized further when aa=False since we know // their actual dimensions. template - static inline std::tuple, int, scalar_t> _compute_indices_weights_aa( + static inline std::tuple, int, scalar_t> _compute_index_ranges_weights( int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, int64_t reshape_dim, scalar_t scale, - int interp_size, aa_filter_fn_t aa_filter_fn, bool antialias, double align_corners_delta + int interp_size, aa_filter_fn_t aa_filter_fn, bool antialias, bool align_corners ) { std::vector output; @@ -845,24 +908,35 @@ struct HelperInterpBase { scalar_t wt_max = 0.0; for (const auto i : c10::irange(output_size)) { - int64_t xmin, xmax; - auto wt_max_i = HelperInterpBase::_compute_weights_aa( - i, - input_size, - scale, - support, - wt_ptr + i * max_interp_size, - max_interp_size, - aa_filter_fn, - xmin, - xmax, - antialias, - align_corners_delta); - + int64_t xmin, xsize; + scalar_t wt_max_i; + if (antialias) { + wt_max_i = HelperInterpBase::_compute_indices_min_size_weights_aa( + i, + input_size, + scale, + support, + wt_ptr + i * max_interp_size, + max_interp_size, + aa_filter_fn, + xmin, + xsize); + } else { + wt_max_i = HelperInterpBase::_compute_indices_min_size_weights( + i, + input_size, + scale, + wt_ptr + i * max_interp_size, + max_interp_size, + aa_filter_fn, + align_corners, + xmin, + xsize); + } wt_max = std::max(wt_max, wt_max_i); idx_ptr_xmin[i] = xmin * stride; - idx_ptr_size[i] = xmax; + idx_ptr_size[i] = xsize; idx_ptr_stride[i] = stride; wt_idx_ptr[i] = i * max_interp_size * weight_index_stride; } @@ -878,7 +952,7 @@ struct HelperInterpBase { uint8 in basic_loop_aa_horizontal (and vertical) In essence the idea is to avoid a multiplication between a float (the - weight) and an int (the pixel value) and instead run a multpilication between + weight) and an int (the pixel value) and instead run a multiplication between 2 ints: ```py @@ -911,7 +985,7 @@ struct HelperInterpBase { = what we wanted */ template - static inline std::tuple, int, unsigned int> _compute_indices_int16_weights_aa( + static inline std::tuple, int, unsigned int> _compute_index_ranges_int16_weights( int64_t input_size, int64_t output_size, int64_t stride, int64_t ndims, int64_t reshape_dim, bool align_corners, const c10::optional opt_scale, int interp_size, 
aa_filter_fn_t aa_filter_fn, bool antialias, bool align_i32=false @@ -921,10 +995,9 @@ struct HelperInterpBase { input_size, output_size, align_corners, opt_scale); std::vector indices_weights; - auto align_corners_delta = (align_corners && !antialias) ? 0.5 : 0.0; double wt_max; - std::tie(indices_weights, interp_size, wt_max) = HelperInterpBase::_compute_indices_weights_aa( - input_size, output_size, stride, ndims, reshape_dim, scale, interp_size, aa_filter_fn, antialias, align_corners_delta); + std::tie(indices_weights, interp_size, wt_max) = HelperInterpBase::_compute_index_ranges_weights( + input_size, output_size, stride, ndims, reshape_dim, scale, interp_size, aa_filter_fn, antialias, align_corners); // Rescale float weights to int16 and compute weights precision auto weights_f64 = indices_weights[3]; @@ -1008,8 +1081,8 @@ struct HelperInterpNearest : public HelperInterpBase { HelperInterpNearest::init_indices_weights( scalar_type, output, output_size, ndims, reshape_dim, HelperInterpNearest::interp_size); - AT_DISPATCH_FLOATING_TYPES_AND( - ScalarType::BFloat16, scalar_type, "compute_indices_weights_nearest", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, kHalf, scalar_type, "compute_indices_weights_nearest", [&] { using opmath_t = at::opmath_type; opmath_t scale = area_pixel_compute_scale(input_size, output_size, align_corners, opt_scale); @@ -1059,9 +1132,10 @@ struct HelperInterpNearestExact : public HelperInterpNearest { HelperInterpNearest::init_indices_weights( scalar_type, output, output_size, ndims, reshape_dim, HelperInterpNearest::interp_size); - AT_DISPATCH_FLOATING_TYPES( - scalar_type, "compute_indices_weights_nearest", [&] { - scalar_t scale = area_pixel_compute_scale(input_size, output_size, align_corners, opt_scale); + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, kHalf, scalar_type, "compute_indices_weights_nearest", [&] { + using opmath_t = at::opmath_type; + opmath_t scale = area_pixel_compute_scale(input_size, output_size, align_corners, opt_scale); auto input_index_ptr = output[0].data_ptr(); int64_t input_index; @@ -1071,7 +1145,6 @@ struct HelperInterpNearestExact : public HelperInterpNearest { // index_f32 = (output_index + 0.5) * scale - 0.5 // input_index = round(index_f32) // Same as Pillow and Scikit-Image/Scipy ndi.zoom - using opmath_t = at::opmath_type; for (const auto i : c10::irange(output_size)) { const auto real_input_index = area_pixel_compute_source_index( @@ -1108,8 +1181,8 @@ struct HelperInterpLinear : public HelperInterpBase { std::vector output; HelperInterpLinear::init_indices_weights( scalar_type, output, output_size, ndims, reshape_dim, HelperInterpLinear::interp_size); - AT_DISPATCH_FLOATING_TYPES_AND( - ScalarType::BFloat16, scalar_type, "compute_indices_weights_linear", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, kHalf, scalar_type, "compute_indices_weights_linear", [&] { using opmath_t = at::opmath_type; opmath_t scale = area_pixel_compute_scale(input_size, output_size, align_corners, opt_scale); @@ -1149,7 +1222,7 @@ struct HelperInterpLinear : public HelperInterpBase { return 0.0; } - static inline std::vector compute_indices_weights_aa( + static inline std::vector compute_index_ranges_weights( at::ScalarType scalar_type, int64_t input_size, int64_t output_size, @@ -1163,17 +1236,14 @@ struct HelperInterpLinear : public HelperInterpBase { std::vector indices_weights; AT_DISPATCH_FLOATING_TYPES( - scalar_type, "compute_indices_weights_aa", [&] { + scalar_type, "compute_index_ranges_weights", [&] { scalar_t scale = 
area_pixel_compute_scale( input_size, output_size, align_corners, opt_scale); auto interp_size = HelperInterpLinear::interp_size; - int unused; - scalar_t unused_2; - auto align_corners_delta = (align_corners && !antialias) ? 0.5 : 0.0; - std::tie(indices_weights, unused, unused_2) = HelperInterpLinear::_compute_indices_weights_aa( + indices_weights = std::get<0>(HelperInterpLinear::_compute_index_ranges_weights( input_size, output_size, stride, @@ -1183,13 +1253,13 @@ struct HelperInterpLinear : public HelperInterpBase { interp_size, &HelperInterpLinear::aa_filter, /*antialias=*/antialias, - /*align_corners_delta=*/align_corners_delta); + /*align_corners=*/align_corners)); } ); return indices_weights; } - static inline std::tuple, int, unsigned int> compute_indices_int16_weights_aa( + static inline std::tuple, int, unsigned int> compute_index_ranges_int16_weights( int64_t input_size, int64_t output_size, int64_t stride, @@ -1203,7 +1273,7 @@ struct HelperInterpLinear : public HelperInterpBase { auto interp_size = HelperInterpLinear::interp_size; auto fn = HelperInterpLinear::aa_filter; - return HelperInterpLinear::_compute_indices_int16_weights_aa( + return HelperInterpLinear::_compute_index_ranges_int16_weights( input_size, output_size, stride, ndims, reshape_dim, align_corners, opt_scale, interp_size, fn, antialias, align_i32); } @@ -1233,8 +1303,8 @@ struct HelperInterpCubic : public HelperInterpBase { HelperInterpCubic::init_indices_weights( scalar_type, output, output_size, ndims, reshape_dim, HelperInterpCubic::interp_size); - AT_DISPATCH_FLOATING_TYPES_AND( - ScalarType::BFloat16, scalar_type, "compute_indices_weights_cubic", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, kHalf, scalar_type, "compute_indices_weights_cubic", [&] { using opmath_t = at::opmath_type; opmath_t scale = area_pixel_compute_scale(input_size, output_size, align_corners, opt_scale); @@ -1286,7 +1356,7 @@ struct HelperInterpCubic : public HelperInterpBase { return 0.0; } - static inline std::vector compute_indices_weights_aa( + static inline std::vector compute_index_ranges_weights( at::ScalarType scalar_type, int64_t input_size, int64_t output_size, @@ -1300,17 +1370,14 @@ struct HelperInterpCubic : public HelperInterpBase { std::vector indices_weights; AT_DISPATCH_FLOATING_TYPES( - scalar_type, "compute_indices_weights_aa", [&] { + scalar_type, "compute_index_ranges_weights", [&] { scalar_t scale = area_pixel_compute_scale( input_size, output_size, align_corners, opt_scale); auto interp_size = HelperInterpCubic::interp_size; - int unused; - scalar_t unused_2; - auto align_corners_delta = (align_corners && !antialias) ? 
0.5 : 0.0; - std::tie(indices_weights, unused, unused_2) = HelperInterpCubic::_compute_indices_weights_aa( + indices_weights = std::get<0>(HelperInterpCubic::_compute_index_ranges_weights( input_size, output_size, stride, @@ -1320,13 +1387,13 @@ struct HelperInterpCubic : public HelperInterpBase { interp_size, &HelperInterpCubic::aa_filter, /*antialias=*/antialias, - /*align_corners_delta*/align_corners_delta); + /*align_corners=*/align_corners)); } ); return indices_weights; } - static inline std::tuple, int, unsigned int> compute_indices_int16_weights_aa( + static inline std::tuple, int, unsigned int> compute_index_ranges_int16_weights( int64_t input_size, int64_t output_size, int64_t stride, @@ -1342,7 +1409,7 @@ struct HelperInterpCubic : public HelperInterpBase { // We have to use the -0.75 constant when aa is False so that this uint8 // path is as close as possible to float results. auto fn = antialias ? HelperInterpCubic::aa_filter : HelperInterpCubic::aa_filter; - return HelperInterpCubic::_compute_indices_int16_weights_aa( + return HelperInterpCubic::_compute_index_ranges_int16_weights( input_size, output_size, stride, ndims, reshape_dim, align_corners, opt_scale, interp_size, fn, antialias, align_i32); } @@ -1407,11 +1474,11 @@ void upsample_generic_Nd_kernel_impl( config.check_all_same_dtype(false) .declare_static_dtype_and_device(input.scalar_type(), input.device()) .add_output(output) - .add_input(restrided_input); + .add_const_input(restrided_input); for (auto & idx_weight: indices_weights) { for (auto& tensor : idx_weight) { - config.add_input(tensor); + config.add_const_input(tensor); } } @@ -1419,14 +1486,14 @@ void upsample_generic_Nd_kernel_impl( if (interp_size > 1) { // Nearest also supports uint8 tensor, so need to handle it separately - AT_DISPATCH_FLOATING_TYPES_AND( - at::ScalarType::BFloat16, iter.dtype(), "upsample_generic_Nd", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2( + kBFloat16, kHalf, iter.dtype(), "upsample_generic_Nd", [&] { // MSVC can not catch constexpr int interp_size here constexpr int mode = F::interp_size; cpu_upsample_generic(iter); }); } else { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Byte, at::ScalarType::BFloat16, + AT_DISPATCH_FLOATING_TYPES_AND3(kByte, kBFloat16, kHalf, iter.dtype(), "upsample_generic_Nd", [&] { constexpr int mode = F::interp_size; cpu_upsample_generic(iter); @@ -1504,7 +1571,7 @@ void _separable_upsample_generic_Nd_kernel_impl_single_dim( // This is a special branch to provide uint8 dtype support for bilinear and bicubic modes only TORCH_INTERNAL_ASSERT(F::interp_size == 2 || F::interp_size == 4); std::tie(indices_weights, unused, weights_precision) = - F::compute_indices_int16_weights_aa( + F::compute_index_ranges_int16_weights( input.size(interp_dim), oshape[interp_dim], input.stride(interp_dim) * input.element_size(), input.dim(), interp_dim, align_corners, scales[interp_dim - 2], @@ -1512,7 +1579,7 @@ void _separable_upsample_generic_Nd_kernel_impl_single_dim( TORCH_INTERNAL_ASSERT(weights_precision > 0); } else { indices_weights = - F::compute_indices_weights_aa( + F::compute_index_ranges_weights( input_scalar_type, input.size(interp_dim), oshape[interp_dim], input.stride(interp_dim) * input.element_size(), input.dim(), interp_dim, align_corners, scales[interp_dim - 2], @@ -1523,10 +1590,10 @@ void _separable_upsample_generic_Nd_kernel_impl_single_dim( config.check_all_same_dtype(false) .declare_static_dtype_and_device(input.scalar_type(), input.device()) .add_output(output) - .add_input(restrided_input); + 
.add_const_input(restrided_input); for (auto& tensor : indices_weights) { - config.add_input(tensor); + config.add_const_input(tensor); } auto iter = config.build(); @@ -1662,7 +1729,7 @@ void upsample_nearest2d_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (_use_vectorized_kernel_cond_2d(output, input)) { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Byte, at::ScalarType::BFloat16, + AT_DISPATCH_FLOATING_TYPES_AND3(kByte, kBFloat16, kHalf, input.scalar_type(), "upsample_nearest2d_channels_last", [&] { cpu_upsample_nearest_channels_last(output, input, {scales_h, scales_w}); }); @@ -1678,7 +1745,7 @@ void _upsample_nearest_exact2d_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (_use_vectorized_kernel_cond_2d(output, input)) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Byte, input.scalar_type(), "upsample_nearest2d_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND3(kByte, kBFloat16, kHalf, input.scalar_type(), "upsample_nearest2d_channels_last", [&] { cpu_upsample_nearest_channels_last(output, input, {scales_h, scales_w}); }); } else { @@ -1694,7 +1761,7 @@ void upsample_nearest3d_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (_use_vectorized_kernel_cond_3d(output, input)) { - AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Byte, at::ScalarType::BFloat16, + AT_DISPATCH_FLOATING_TYPES_AND3(kByte, kBFloat16, kHalf, input.scalar_type(), "upsample_nearest3d_channels_last", [&] { cpu_upsample_nearest_channels_last(output, input, {scales_d, scales_h, scales_w}); }); @@ -1711,7 +1778,7 @@ void _upsample_nearest_exact3d_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (_use_vectorized_kernel_cond_3d(output, input)) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::Byte, input.scalar_type(), "upsample_nearest3d_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND3(kByte, kBFloat16, kHalf, input.scalar_type(), "upsample_nearest3d_channels_last", [&] { cpu_upsample_nearest_channels_last(output, input, {scales_d, scales_h, scales_w}); }); } else { @@ -1743,7 +1810,7 @@ void upsample_bilinear2d_kernel_impl_float( // That's not the case for masks though (C == 1), which strongly benefit from // using the generic kernel. 
if ((_use_vectorized_kernel_cond_2d(output, input)) || (at::get_num_threads() == 1 && input.size(1) == 3)) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, input.scalar_type(), "upsample_bilinear2d_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "upsample_bilinear2d_channels_last", [&] { cpu_upsample_linear_channels_last(output, input, align_corners, {scales_h, scales_w}); }); } else { @@ -1812,7 +1879,7 @@ void upsample_trilinear3d_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if ((_use_vectorized_kernel_cond_3d(output, input))) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, input.scalar_type(), "upsample_trilinear3d_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, input.scalar_type(), "upsample_trilinear3d_channels_last", [&] { cpu_upsample_linear_channels_last(output, input, align_corners, {scales_d, scales_h, scales_w}); }); } else { @@ -1890,7 +1957,7 @@ void cpu_upsample_genNd_backward_aa( auto grad_output = grad_output_.contiguous(); auto grad_input = grad_input_.contiguous(); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); auto grad_input_data = grad_input.mutable_data_ptr(); auto input_sizes = grad_input.sizes().vec(); auto output_sizes = grad_output.sizes().vec(); @@ -1939,7 +2006,7 @@ void cpu_upsample_genNd_backward_aa( aa_filter_fn_t filter_fn = &F::aa_filter; for (const auto oh : c10::irange(output_height)) { - F::_compute_weights_aa( + F::_compute_indices_min_size_weights_aa( oh, input_height, height_scale, @@ -1948,12 +2015,10 @@ void cpu_upsample_genNd_backward_aa( interp_height, filter_fn, ymin, - ysize, - /*antialias=*/true, - /*align_corners_delta=*/0.0); + ysize); for (const auto ow : c10::irange(output_width)) { - F::_compute_weights_aa( + F::_compute_indices_min_size_weights_aa( ow, input_width, width_scale, @@ -1962,9 +2027,7 @@ void cpu_upsample_genNd_backward_aa( interp_width, filter_fn, xmin, - xsize, - /*antialias=*/true, - /*align_corners_delta=*/0.0); + xsize); for (const auto c : c10::irange(begin, end)) { scalar_t grad_output_value = diff --git a/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h b/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h index 2a996cfa4f1c9..726a83c20963d 100644 --- a/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h +++ b/aten/src/ATen/native/cpu/UpSampleKernelAVXAntialias.h @@ -66,7 +66,7 @@ at::Tensor unpack_rgb(const at::Tensor& packed_tensor) { // into as 32 bits. This generalizes to num_channels <= 4 and also works for // non-channels_last tensors. 
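Before the resampling loops below consume the int16 weights, here is a small self-contained sketch of the fixed-point idea behind compute_index_ranges_int16_weights (the "multiplication between 2 ints" note referenced above): weights are pre-scaled by 2^precision into int16, pixels are accumulated in int32, and a rounding shift recovers the uint8 result. The precision value and clamping here are illustrative only; the kernel derives its own weights_precision.

```cpp
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  const std::vector<double> weights = {0.1, 0.4, 0.4, 0.1};  // already normalized
  const std::vector<uint8_t> pixels = {10, 200, 220, 30};

  const unsigned precision = 12;  // weights stored as fixed point with 12 fractional bits
  std::vector<int16_t> w16(weights.size());
  for (size_t i = 0; i < weights.size(); ++i) {
    w16[i] = static_cast<int16_t>(std::lround(weights[i] * (1 << precision)));
  }

  // Accumulate in int32: an int16 weight times a uint8 pixel cannot overflow here.
  int32_t acc = 1 << (precision - 1);  // +0.5 in fixed point, for rounding
  for (size_t i = 0; i < pixels.size(); ++i) {
    acc += static_cast<int32_t>(w16[i]) * pixels[i];
  }
  const uint8_t out = static_cast<uint8_t>(std::clamp(acc >> precision, 0, 255));

  // Reference result computed in floating point, for comparison.
  double ref = 0.0;
  for (size_t i = 0; i < pixels.size(); ++i) ref += weights[i] * pixels[i];
  std::printf("fixed-point: %d   float: %.3f\n", static_cast<int>(out), ref);
  return 0;
}
```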
- const uint8_t* packed = (const uint8_t*)packed_tensor.data_ptr(); + const uint8_t* packed = (const uint8_t*)packed_tensor.const_data_ptr(); auto num_pixels = packed_tensor.size(1) * packed_tensor.size(2); auto num_channels = packed_tensor.size(0); @@ -180,18 +180,18 @@ void ImagingResampleHorizontal( // Although this may not be needed if / when we port all this code to use // Vec.h since this would potentially give us another fall-back implem - const int16_t* kk = (int16_t*)(horiz_indices_weights[3].data_ptr()); + const int16_t* kk = (int16_t*)(horiz_indices_weights[3].const_data_ptr()); auto xout = unpacked_output.size(2); auto yout = unpacked_output.size(1); auto xin = unpacked_input.size(2); TORCH_INTERNAL_ASSERT(num_channels == unpacked_input.size(0)); - const int64_t* idx_ptr_xmin = horiz_indices_weights[0].data_ptr(); - const int64_t* idx_ptr_size = horiz_indices_weights[1].data_ptr(); + const int64_t* idx_ptr_xmin = horiz_indices_weights[0].const_data_ptr(); + const int64_t* idx_ptr_size = horiz_indices_weights[1].const_data_ptr(); uint8_t* unpacked_output_p = unpacked_output.data_ptr(); - const uint8_t* unpacked_input_p = unpacked_input.data_ptr(); + const uint8_t* unpacked_input_p = unpacked_input.const_data_ptr(); int64_t yy = 0; auto xout_stride = xout * num_channels; @@ -255,13 +255,13 @@ void ImagingResampleVertical( // basic_loop_aa_vertical) // Although this may not be needed if / when we port all this code to use // Vec.h since this would potentially give us another fall-back implem - const int16_t* kk = (int16_t*)(vert_indices_weights[3].data_ptr()); + const int16_t* kk = (int16_t*)(vert_indices_weights[3].const_data_ptr()); - const int64_t* idx_ptr_xmin = vert_indices_weights[0].data_ptr(); - const int64_t* idx_ptr_size = vert_indices_weights[1].data_ptr(); + const int64_t* idx_ptr_xmin = vert_indices_weights[0].const_data_ptr(); + const int64_t* idx_ptr_size = vert_indices_weights[1].const_data_ptr(); uint8_t* unpacked_output_p = unpacked_output.data_ptr(); - const uint8_t* unpacked_input_p = unpacked_input.data_ptr(); + const uint8_t* unpacked_input_p = unpacked_input.const_data_ptr(); auto xout = unpacked_output.size(2); auto yout = unpacked_output.size(1); @@ -296,7 +296,7 @@ void ImagingResampleVertical( // [ Weights computation for uint8_t and multiplication trick ] // For details on how the AVX kernels are implemented, see // https://gist.github.com/NicolasHug/47c97d731f05eaad5694c173849b86f5 -// See also [ Support for antialias=False as a subcase of antilias=True ] to +// See also [ Support for antialias=False as a subcase of antialias=True ] to // learn more about how the antialias=False case is computed. The same holds // here: all these kernels are general enough to handle an arbitrary number of // weights, but when aa=False they could be optimized further. @@ -344,7 +344,7 @@ void upsample_avx_bilinear_bicubic_uint8( int interp_dim = 3; auto stride = (skip_unpacking) ? num_channels : 4; std::tie(horiz_indices_weights, ksize_horiz, horiz_weights_precision) = - F::compute_indices_int16_weights_aa( + F::compute_index_ranges_int16_weights( /*input_size=*/xin, /*output_size=*/xout, /*stride=*/stride, @@ -360,7 +360,7 @@ void upsample_avx_bilinear_bicubic_uint8( int interp_dim = 2; auto stride = (skip_unpacking) ? 
num_channels * xout : 4 * xout; std::tie(vert_indices_weights, ksize_vert, vert_weights_precision) = - F::compute_indices_int16_weights_aa( + F::compute_index_ranges_int16_weights( /*input_size=*/yin, /*output_size=*/yout, /*stride=*/stride, @@ -699,7 +699,7 @@ void ImagingResampleHorizontalConvolution8u4x( // Memcpy 4-bytes is faster than 3-bytes and this is a boundary case when we want to write // 4 bytes (R G B | X) to the output buffer (X1 X2 X3 | R1). // The 4th byte in the register (X) has a garbage value and 4th byte in the output buffer (R1) has a correct - // value which was preveiously computed by another line. In other words, it means that we can not overwrite + // value which was previously computed by another line. In other words, it means that we can not overwrite // it by simply writing 4 bytes from the register to the output. We'll do the following: // v----------| // Output = [... X1 X2 X3 | R1 G1 B1 R2 ...] @@ -1040,7 +1040,7 @@ void ImagingResampleHorizontalConvolution8u( // Memcpy 4-bytes is faster than 3-bytes and this is a boundary case when we want to write // 4 bytes (R G B | X) to the output buffer (X1 X2 X3 | R1). // The 4th byte in the register (X) has a garbage value and 4th byte in the output buffer (R1) has a correct - // value which was preveiously computed by another line. In other words, it means that we can not overwrite + // value which was previously computed by another line. In other words, it means that we can not overwrite // it by simply writing 4 bytes from the register to the output. We'll do the following: // v----------| // Output = [... X1 X2 X3 | R1 G1 B1 R2 ...] diff --git a/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp b/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp index 0e2511394ec75..b97b5cefee2c8 100644 --- a/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp +++ b/aten/src/ATen/native/cpu/UpSampleMoreKernel.cpp @@ -14,11 +14,13 @@ namespace { using scale_t = std::vector>; -template +template , + typename std::enable_if_t || !std::is_same::value, int> = 0> void inline nearest_channels_last_acc(acc_t* gin, scalar_t* gout, int64_t size) { - TORCH_CHECK((std::is_same::value), + TORCH_CHECK((std::is_same::value), "acc data type of Upsample backward should be same as scalar_t for float or double on CPU.") - using Vec = vec::Vectorized; + using Vec = Vectorized; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { Vec gin_vec = Vec::loadu(gin + d) + Vec::loadu(gout + d); @@ -29,14 +31,16 @@ void inline nearest_channels_last_acc(acc_t* gin, scalar_t* gout, int64_t size) } } -template <> -void inline nearest_channels_last_acc(float* gin, BFloat16* gout, int64_t size) { - using bVec = vec::Vectorized; - using fVec = vec::Vectorized; +template , + typename std::enable_if_t && std::is_same::value, int> = 0> +void inline nearest_channels_last_acc(acc_t* gin, scalar_t* gout, int64_t size) { + using bVec = Vectorized; + using fVec = Vectorized; int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec gout_bvec = bVec::loadu(gout + d); - auto [gout_fvec0, gout_fvec1] = convert_bfloat16_float(gout_bvec); + auto [gout_fvec0, gout_fvec1] = convert_to_float(gout_bvec); fVec gin_fvec0 = fVec::loadu(gin + d) + gout_fvec0; fVec gin_fvec1 = fVec::loadu(gin + d + fVec::size()) + gout_fvec1; gin_fvec0.store(gin + d); @@ -47,11 +51,13 @@ void inline nearest_channels_last_acc(float* gin, BFloat16* gout, int64_t size) } } -template -void inline linear_channels_last_acc(acc_t* gin, scalar_t* gout, acc_t w, int64_t size) 
{ - TORCH_CHECK((std::is_same::value), +template , + typename std::enable_if_t || !std::is_same::value, int> = 0> +void inline linear_channels_last_acc(acc_t* gin, const scalar_t* gout, acc_t w, int64_t size) { + TORCH_CHECK((std::is_same::value), "acc data type of Upsample backward should be same as scalar_t for float or double on CPU.") - using Vec = vec::Vectorized; + using Vec = Vectorized; int64_t d = 0; for (; d < size - (size % Vec::size()); d += Vec::size()) { Vec gin_vec = Vec::loadu(gin + d) + Vec(w) * Vec::loadu(gout + d); @@ -62,14 +68,16 @@ void inline linear_channels_last_acc(acc_t* gin, scalar_t* gout, acc_t w, int64_ } } -template <> -void inline linear_channels_last_acc(float* gin, BFloat16* gout, float w, int64_t size) { - using bVec = vec::Vectorized; - using fVec = vec::Vectorized; +template , + typename std::enable_if_t && std::is_same::value, int> = 0> +void inline linear_channels_last_acc(acc_t* gin, const scalar_t* gout, acc_t w, int64_t size) { + using bVec = Vectorized; + using fVec = Vectorized; int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec gout_bvec = bVec::loadu(gout + d); - auto [gout_fvec0, gout_fvec1] = convert_bfloat16_float(gout_bvec); + auto [gout_fvec0, gout_fvec1] = convert_to_float(gout_bvec); fVec gin_fvec0 = fVec::loadu(gin + d) + fVec(w) * gout_fvec0; fVec gin_fvec1 = fVec::loadu(gin + d + fVec::size()) + fVec(w) * gout_fvec1; gin_fvec0.store(gin + d); @@ -91,7 +99,7 @@ void cpu_upsample_nearest_backward( auto grad_output = grad_output_.contiguous(); auto grad_input = grad_input_.contiguous(); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); auto grad_input_data = grad_input.mutable_data_ptr(); auto input_sizes = grad_input.sizes().vec(); auto output_sizes = grad_output.sizes().vec(); @@ -228,7 +236,7 @@ void cpu_upsample_nearest_backward_channels_last( auto grad_output = grad_output_.contiguous(channels_last_memory_format); auto grad_input = grad_input_.contiguous(channels_last_memory_format); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); auto grad_input_data = grad_input.mutable_data_ptr(); auto input_sizes = grad_input.sizes().vec(); @@ -262,7 +270,7 @@ void cpu_upsample_nearest_backward_channels_last( int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[0]); for (const auto ow : c10::irange(output_width)) { int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[1]); - scalar_t* grad_output_ptr = grad_output_data + + const scalar_t* grad_output_ptr = grad_output_data + (n * output_height * output_width + oh * output_width + ow) * channels; opmath_t* buffer_ptr = acc_data_ptr + input_offset + (ih * input_width + iw) * channels; nearest_channels_last_acc(buffer_ptr, grad_output_ptr, channels); @@ -295,7 +303,7 @@ void cpu_upsample_nearest_backward_channels_last( int64_t ih = nearest_idx_fn(oh, input_height, output_height, scales[1]); for (int64_t ow = 0; ow < output_width; ow++) { int64_t iw = nearest_idx_fn(ow, input_width, output_width, scales[2]); - scalar_t* grad_output_ptr = grad_output_data + + const scalar_t* grad_output_ptr = grad_output_data + (n * output_depth * output_height * output_width + od * output_height * output_width + oh * output_width + ow) * channels; @@ -330,7 +338,7 @@ void upsample_nearest1d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, 
grad_output.scalar_type(), "upsample_nearest1d_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_nearest1d_backward", [&] { cpu_upsample_nearest_backward(grad_input, grad_output, {scales_w}); }); } @@ -339,7 +347,7 @@ void _upsample_nearest_exact1d_backward_kernel_impl( const Tensor& grad_input, const Tensor& grad_output, c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "_upsample_nearest_exact1d_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest_exact1d_backward", [&] { cpu_upsample_nearest_backward(grad_input, grad_output, {scales_w}); }); } @@ -350,11 +358,11 @@ void upsample_nearest2d_backward_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast)) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_nearest2d_backward_cl", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_nearest2d_backward_cl", [&] { cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_h, scales_w}); }); } else { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_nearest2d_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_nearest2d_backward", [&] { cpu_upsample_nearest_backward(grad_input, grad_output, {scales_h, scales_w}); }); } @@ -366,11 +374,11 @@ void _upsample_nearest_exact2d_backward_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast)) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "_upsample_nearest_exact2d_backward_cl", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest_exact2d_backward_cl", [&] { cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_h, scales_w}); }); } else { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "_upsample_nearest_exact2d_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest_exact2d_backward", [&] { cpu_upsample_nearest_backward(grad_input, grad_output, {scales_h, scales_w}); }); } @@ -382,9 +390,15 @@ void upsample_nearest3d_backward_kernel_impl( c10::optional scales_d, c10::optional scales_h, c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_nearest3d_backward", [&] { - cpu_upsample_nearest_backward(grad_input, grad_output, {scales_d, scales_h, scales_w}); - }); + if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest3d_backward_cl", [&] { + cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_d, scales_h, scales_w}); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_nearest3d_backward", [&] { + cpu_upsample_nearest_backward(grad_input, grad_output, {scales_d, scales_h, scales_w}); + }); + } } void _upsample_nearest_exact3d_backward_kernel_impl( @@ -393,9 +407,15 @@ void _upsample_nearest_exact3d_backward_kernel_impl( c10::optional scales_d, c10::optional scales_h, 
c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "_upsample_nearest_exact3d_backward", [&] { - cpu_upsample_nearest_backward(grad_input, grad_output, {scales_d, scales_h, scales_w}); - }); + if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest_exact3d_backward_cl", [&] { + cpu_upsample_nearest_backward_channels_last(grad_input, grad_output, {scales_d, scales_h, scales_w}); + }); + } else { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "_upsample_nearest_exact3d_backward", [&] { + cpu_upsample_nearest_backward(grad_input, grad_output, {scales_d, scales_h, scales_w}); + }); + } } template @@ -410,7 +430,7 @@ void cpu_upsample_linear_backward( auto grad_output = grad_output_.contiguous(); auto grad_input = grad_input_.contiguous(); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); auto grad_input_data = grad_input.mutable_data_ptr(); auto input_sizes = grad_input.sizes().vec(); auto output_sizes = grad_output.sizes().vec(); @@ -587,7 +607,7 @@ void cpu_upsample_linear_backward_channels_last( auto grad_output = grad_output_.contiguous(channels_last_memory_format); auto grad_input = grad_input_.contiguous(channels_last_memory_format); - auto grad_output_data = grad_output.data_ptr(); + auto grad_output_data = grad_output.const_data_ptr(); auto grad_input_data = grad_input.mutable_data_ptr(); auto input_sizes = grad_input.sizes().vec(); @@ -635,7 +655,7 @@ void cpu_upsample_linear_backward_channels_last( for (const auto ow : c10::irange(output_width)) { compute_source_index_and_lambda( iw0, iw1, w0lambda, w1lambda, width_scale, ow, input_width, output_width, align_corners); - scalar_t* grad_output_ptr = grad_output_data + + const scalar_t* grad_output_ptr = grad_output_data + (n * output_height * output_width + oh * output_width + ow) * channels; linear_channels_last_acc(input_indexr(n, ih0, iw0, input_offset), grad_output_ptr, h0lambda * w0lambda, channels); /* i00 */ linear_channels_last_acc(input_indexr(n, ih0, iw1, input_offset), grad_output_ptr, h0lambda * w1lambda, channels); /* i01 */ @@ -687,7 +707,7 @@ void cpu_upsample_linear_backward_channels_last( for (const auto ow : c10::irange(output_width)) { compute_source_index_and_lambda( iw0, iw1, w0lambda, w1lambda, width_scale, ow, input_width, output_width, align_corners); - scalar_t* grad_output_ptr = grad_output_data + (n * output_depth * output_height * output_width + + const scalar_t* grad_output_ptr = grad_output_data + (n * output_depth * output_height * output_width + od * output_height * output_width + oh * output_width + ow) * channels; linear_channels_last_acc(input_indexr(n, id0, ih0, iw0, input_offset), grad_output_ptr, d0lambda * h0lambda * w0lambda, channels); /* i000 */ linear_channels_last_acc(input_indexr(n, id0, ih0, iw1, input_offset), grad_output_ptr, d0lambda * h0lambda * w1lambda, channels); /* i001 */ @@ -726,7 +746,7 @@ void upsample_linear1d_backward_kernel_impl( const Tensor& grad_output, bool align_corners, c10::optional scales_w) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_linear1d_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_linear1d_backward", [&] { cpu_upsample_linear_backward(grad_input, grad_output, align_corners, {scales_w}); }); } @@ 
-738,11 +758,11 @@ void upsample_bilinear2d_backward_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast)) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_bilinear2d_backward_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_bilinear2d_backward_channels_last", [&] { cpu_upsample_linear_backward_channels_last(grad_input, grad_output, align_corners, {scales_h, scales_w}); }); } else { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_bilinear2d_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_bilinear2d_backward", [&] { cpu_upsample_linear_backward(grad_input, grad_output, align_corners, {scales_h, scales_w}); }); } @@ -756,11 +776,11 @@ void upsample_trilinear3d_backward_kernel_impl( c10::optional scales_h, c10::optional scales_w) { if (grad_output.is_contiguous(at::MemoryFormat::ChannelsLast3d)) { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_trilinear3d_backward_channels_last", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_trilinear3d_backward_channels_last", [&] { cpu_upsample_linear_backward_channels_last(grad_input, grad_output, align_corners, {scales_d, scales_h, scales_w}); }); } else { - AT_DISPATCH_FLOATING_TYPES_AND(at::ScalarType::BFloat16, grad_output.scalar_type(), "upsample_trilinear3d_backward", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kBFloat16, kHalf, grad_output.scalar_type(), "upsample_trilinear3d_backward", [&] { cpu_upsample_linear_backward(grad_input, grad_output, align_corners, {scales_d, scales_h, scales_w}); }); } diff --git a/aten/src/ATen/native/cpu/WeightNormKernel.cpp b/aten/src/ATen/native/cpu/WeightNormKernel.cpp index cace911114efe..8d483d24636ed 100644 --- a/aten/src/ATen/native/cpu/WeightNormKernel.cpp +++ b/aten/src/ATen/native/cpu/WeightNormKernel.cpp @@ -70,8 +70,7 @@ inline void sum_norm_per_row( int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec v_bvec = bVec::loadu(v_ptr + d); - fVec v_fvec0, v_fvec1; - std::tie(v_fvec0, v_fvec1) = convert_bfloat16_float(v_bvec); + auto [v_fvec0, v_fvec1] = convert_bfloat16_float(v_bvec); fVec out_fvec0 = fVec::loadu(out_ptr + d) + v_fvec0 * v_fvec0; fVec out_fvec1 = fVec::loadu(out_ptr + d + fVec::size()) + v_fvec1 * v_fvec1; @@ -109,8 +108,7 @@ inline void apply_norm_per_row( int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec v_bvec = bVec::loadu(v_ptr + d); - fVec v_fvec0, v_fvec1; - std::tie(v_fvec0, v_fvec1) = convert_bfloat16_float(v_bvec); + auto [v_fvec0, v_fvec1] = convert_bfloat16_float(v_bvec); fVec w_fvec0 = fVec::loadu(a_ptr + d) * v_fvec0; fVec w_fvec1 = fVec::loadu(a_ptr + d + fVec::size()) * v_fvec1; @@ -249,11 +247,9 @@ inline void sum_product_per_row( int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec grad_w_bvec = bVec::loadu(grad_w_ptr + d); - fVec grad_w_fvec0, grad_w_fvec1; - std::tie(grad_w_fvec0, grad_w_fvec1) = convert_bfloat16_float(grad_w_bvec); + auto [grad_w_fvec0, grad_w_fvec1] = convert_bfloat16_float(grad_w_bvec); bVec v_bvec = bVec::loadu(v_ptr + d); - fVec v_fvec0, v_fvec1; - std::tie(v_fvec0, v_fvec1) = convert_bfloat16_float(v_bvec); + auto [v_fvec0, v_fvec1] = convert_bfloat16_float(v_bvec); fVec out_fvec0 = 
fVec::loadu(out_ptr + d) + grad_w_fvec0 * v_fvec0; fVec out_fvec1 = fVec::loadu(out_ptr + d + fVec::size()) + grad_w_fvec1 * v_fvec1; @@ -298,11 +294,9 @@ inline void apply_per_row_backward( int64_t d = 0; for (; d < size - (size % bVec::size()); d += bVec::size()) { bVec grad_w_bvec = bVec::loadu(grad_w_ptr + d); - fVec grad_w_fvec0, grad_w_fvec1; - std::tie(grad_w_fvec0, grad_w_fvec1) = convert_bfloat16_float(grad_w_bvec); + auto [grad_w_fvec0, grad_w_fvec1] = convert_bfloat16_float(grad_w_bvec); bVec v_bvec = bVec::loadu(v_ptr + d); - fVec v_fvec0, v_fvec1; - std::tie(v_fvec0, v_fvec1) = convert_bfloat16_float(v_bvec); + auto [v_fvec0, v_fvec1] = convert_bfloat16_float(v_bvec); fVec grad_v_fvec0 = fVec::loadu(a_ptr + d) * grad_w_fvec0 - fVec::loadu(b_ptr + d) * v_fvec0; fVec grad_v_fvec1 = fVec::loadu(a_ptr + d + fVec::size()) * grad_w_fvec1 diff --git a/aten/src/ATen/native/cpu/avx_mathfun.h b/aten/src/ATen/native/cpu/avx_mathfun.h index 080cd833d3a10..f4fd3b7bc461f 100644 --- a/aten/src/ATen/native/cpu/avx_mathfun.h +++ b/aten/src/ATen/native/cpu/avx_mathfun.h @@ -240,7 +240,7 @@ _PS256_CONST(coscof_p2, 4.166664568298827E-002); _PS256_CONST(cephes_FOPI, 1.27323954473516); // 4 / M_PI -/* evaluation of 8 sines at onces using AVX intrisics +/* evaluation of 8 sines at onces using AVX intrinsics The code is the exact rewriting of the cephes sinf function. Precision is excellent as long as x < 8192 (I did not bother to diff --git a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp index 58faea7f51b83..bf007114e78c1 100644 --- a/aten/src/ATen/native/cpu/batch_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/batch_norm_kernel.cpp @@ -34,13 +34,13 @@ void batch_norm_cpu_collect_linear_and_constant_terms( const Tensor& save_mean, const Tensor& save_invstd, const Tensor& running_mean, const Tensor& running_var, bool train, double eps) { - const param_t* weight_data = weight.defined() ? weight.data_ptr() : nullptr; - const param_t* bias_data = bias.defined() ? bias.data_ptr() : nullptr; + const param_t* weight_data = weight.defined() ? weight.const_data_ptr() : nullptr; + const param_t* bias_data = bias.defined() ? bias.const_data_ptr() : nullptr; - auto save_mean_a = conditional_accessor_1d(save_mean); - auto save_invstd_a = conditional_accessor_1d(save_invstd); - auto running_mean_a = conditional_accessor_1d(running_mean); - auto running_var_a = conditional_accessor_1d(running_var); + auto save_mean_a = conditional_accessor_1d(save_mean); + auto save_invstd_a = conditional_accessor_1d(save_invstd); + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); /// Collect the linear and constant terms regarding the input. 
/// output(n, c, h, w) @@ -91,7 +91,7 @@ batch_norm_cpu_contiguous_impl(Tensor& output, const Tensor& input, save_mean, save_invstd, running_mean, running_var, train, eps); scalar_t* output_data = output.data_ptr(); - const scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); // Apply the linear terms to the input, // output(n, c, h, w) = input(n, c, h, w) * alpha(c) + beta(c) @@ -143,7 +143,7 @@ batch_norm_cpu_channels_last_impl(Tensor& output, const Tensor& input, save_mean, save_invstd, running_mean, running_var, train, eps); scalar_t* output_data = output.data_ptr(); - const scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); // Apply the linear terms to the input, // output(n, c, h, w) = input(n, c, h, w) * alpha(c) + beta(c) @@ -185,7 +185,7 @@ batch_norm_cpu_collect_stats_contiguous_impl( int64_t image_size = input.numel() / n_batch / n_channel; int64_t N = input.numel() / n_channel; - const scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); scalar_t* mean_data = mean.data_ptr(); scalar_t* var_sum_data = var_sum.data_ptr(); @@ -229,7 +229,7 @@ batch_norm_cpu_collect_stats_channels_last_impl( int64_t n_channel = input.size(1); int64_t N = input.numel() / n_channel; - const scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); scalar_t* mean_data = mean.data_ptr(); scalar_t* var_sum_data = var_sum.data_ptr(); @@ -416,8 +416,8 @@ batch_norm_cpu_backward_contiguous_impl(Tensor& grad_input, Tensor& grad_weight, int64_t image_size = input.numel() / n_batch / n_channel; int64_t N = input.numel() / n_channel; - const scalar_t* grad_output_data = grad_output.data_ptr(); - const scalar_t* input_data = input.data_ptr(); + const scalar_t* grad_output_data = grad_output.const_data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); scalar_t* grad_input_data = grad_input.defined() ? grad_input.mutable_data_ptr() : nullptr; scalar_t* grad_weight_data = grad_weight.defined() ? grad_weight.data_ptr() : nullptr; @@ -426,11 +426,11 @@ batch_norm_cpu_backward_contiguous_impl(Tensor& grad_input, Tensor& grad_weight, const bool grad_weight_null = grad_weight_data == nullptr; const bool grad_bias_null = grad_bias_data == nullptr; - auto weight_a = conditional_accessor_1d(weight); - auto save_mean_a = conditional_accessor_1d(save_mean); - auto save_invstd_a = conditional_accessor_1d(save_invstd); - auto running_mean_a = conditional_accessor_1d(running_mean); - auto running_var_a = conditional_accessor_1d(running_var); + auto weight_a = conditional_accessor_1d(weight); + auto save_mean_a = conditional_accessor_1d(save_mean); + auto save_invstd_a = conditional_accessor_1d(save_invstd); + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); // parallel dim reduce on 'channel' at::parallel_for(0, n_channel, 1, [&](int64_t begin, int64_t end) { @@ -537,22 +537,22 @@ batch_norm_cpu_backward_channels_last_impl(Tensor& grad_input, Tensor& grad_weig int64_t n_channel = input.size(1); int64_t N = input.numel() / n_channel; - const scalar_t* grad_output_data = grad_output.data_ptr(); - const scalar_t* input_data = input.data_ptr(); + const scalar_t* grad_output_data = grad_output.const_data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); scalar_t* grad_input_data = grad_input.defined() ? 
grad_input.mutable_data_ptr() : nullptr; scalar_t* grad_weight_data = grad_weight.defined() ? grad_weight.data_ptr() : nullptr; scalar_t* grad_bias_data = grad_bias.defined() ? grad_bias.data_ptr() : nullptr; - scalar_t* save_mean_data = conditional_data_ptr(save_mean); + const scalar_t* save_mean_data = conditional_data_ptr(save_mean); scalar_t* save_invstd_data = conditional_data_ptr(save_invstd); - scalar_t* running_mean_data = conditional_data_ptr(running_mean); - scalar_t* running_var_data = conditional_data_ptr(running_var); + const scalar_t* running_mean_data = conditional_data_ptr(running_mean); + const scalar_t* running_var_data = conditional_data_ptr(running_var); Tensor weight_ = weight.defined() ? weight : at::ones({n_channel}, input.options()); - const scalar_t* weight_data = weight_.data_ptr(); + const scalar_t* weight_data = weight_.const_data_ptr(); - scalar_t* mean_ptr = nullptr; + const scalar_t* mean_ptr = nullptr; scalar_t* invstd_ptr = nullptr; Tensor invstd = at::empty({0}, input.options()); if (train) { @@ -735,7 +735,7 @@ batch_norm_cpu_contiguous_impl(Tensor& output, const Tensor& input, } scalar_t* output_data = output.data_ptr(); - const scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); const int64_t loop_size = image_size - (image_size % bVec::size()); at::parallel_for(0, n_batch * n_channel, 1, [&](int64_t begin, int64_t end) { @@ -753,8 +753,7 @@ batch_norm_cpu_contiguous_impl(Tensor& output, const Tensor& input, int64_t d = 0; for (; d < loop_size; d += bVec::size()) { bVec data_bvec = bVec::loadu(input_ptr + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); fVec out_fvec0 = data_fvec0 * alpha_fvec + beta_fvec; fVec out_fvec1 = data_fvec1 * alpha_fvec + beta_fvec; @@ -799,7 +798,7 @@ batch_norm_cpu_channels_last_impl(Tensor& output, const Tensor& input, } scalar_t* output_data = output.data_ptr(); - const scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); const int64_t loop_size = n_channel - (n_channel % bVec::size()); at::parallel_for(0, n_batch * image_size, 1, [&](int64_t begin, int64_t end) { @@ -813,8 +812,7 @@ batch_norm_cpu_channels_last_impl(Tensor& output, const Tensor& input, fVec beta_fvec0 = fVec::loadu(beta_data + d); fVec beta_fvec1 = fVec::loadu(beta_data + d + fVec::size()); bVec data_bvec = bVec::loadu(input_ptr + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); fVec out_fvec0 = data_fvec0 * alpha_fvec0 + beta_fvec0; fVec out_fvec1 = data_fvec1 * alpha_fvec1 + beta_fvec1; @@ -839,7 +837,7 @@ inline void batch_norm_cpu_collect_stats_contiguous_internal( int64_t image_size = input.numel() / n_batch / n_channel; int64_t N = input.numel() / n_channel; - const scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); param_t* mean_data = mean.data_ptr(); param_t* var_sum_data = var_sum.data_ptr(); @@ -852,8 +850,7 @@ inline void batch_norm_cpu_collect_stats_contiguous_internal( int64_t d = 0; for (; d < image_size - (image_size % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(input_ptr + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); sum_fvec += data_fvec0; 
sum_fvec += data_fvec1; } @@ -874,8 +871,7 @@ inline void batch_norm_cpu_collect_stats_contiguous_internal( int64_t d = 0; for (; d < image_size - (image_size % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(input_ptr + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); var_fvec += (data_fvec0 - mean_fvec) * (data_fvec0 - mean_fvec); var_fvec += (data_fvec1 - mean_fvec) * (data_fvec1 - mean_fvec); } @@ -912,7 +908,7 @@ inline void batch_norm_cpu_collect_stats_channels_last_internal( int64_t n_channel = input.size(1); int64_t N = input.numel() / n_channel; - const scalar_t* input_data = input.data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); param_t* mean_data = mean.data_ptr(); param_t* var_sum_data = var_sum.data_ptr(); @@ -929,8 +925,7 @@ inline void batch_norm_cpu_collect_stats_channels_last_internal( int64_t d = 0; for (; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(input_ptr + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); fVec sum_fvec0 = fVec::loadu(buffer_ptr + d) + data_fvec0; fVec sum_fvec1 = fVec::loadu(buffer_ptr + d + fVec::size()) + data_fvec1; sum_fvec0.store(buffer_ptr + d); @@ -960,10 +955,8 @@ inline void batch_norm_cpu_collect_stats_channels_last_internal( int64_t d = 0; for (; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { bVec data_bvec = bVec::loadu(input_ptr + d); - fVec data_fvec0, data_fvec1; - std::tie(data_fvec0, data_fvec1) = convert_to_float(data_bvec); - fVec mean_fvec0, mean_fvec1; - std::tie(mean_fvec0, mean_fvec1) = load2f(mean_data + d); + auto [data_fvec0, data_fvec1] = convert_to_float(data_bvec); + auto [mean_fvec0, mean_fvec1] = load2f(mean_data + d); fVec var_fvec0 = fVec::loadu(buffer_ptr + d); fVec var_fvec1 = fVec::loadu(buffer_ptr + d + fVec::size()); var_fvec0 += (data_fvec0 - mean_fvec0) * (data_fvec0 - mean_fvec0); @@ -1013,8 +1006,8 @@ void batch_norm_cpu_backward_contiguous_internal(Tensor& grad_input, Tensor& gra int64_t image_size = input.numel() / n_batch / n_channel; int64_t N = input.numel() / n_channel; - const scalar_t* grad_output_data = grad_output.data_ptr(); - const scalar_t* input_data = input.data_ptr(); + const scalar_t* grad_output_data = grad_output.const_data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); scalar_t* grad_input_data = grad_input.defined() ? grad_input.mutable_data_ptr() : nullptr; param_t* grad_weight_data = grad_weight.defined() ? 
grad_weight.data_ptr() : nullptr; @@ -1023,11 +1016,11 @@ void batch_norm_cpu_backward_contiguous_internal(Tensor& grad_input, Tensor& gra const bool grad_weight_null = grad_weight_data == nullptr; const bool grad_bias_null = grad_bias_data == nullptr; - auto weight_a = conditional_accessor_1d(weight); - auto save_mean_a = conditional_accessor_1d(save_mean); - auto save_invstd_a = conditional_accessor_1d(save_invstd); - auto running_mean_a = conditional_accessor_1d(running_mean); - auto running_var_a = conditional_accessor_1d(running_var); + auto weight_a = conditional_accessor_1d(weight); + auto save_mean_a = conditional_accessor_1d(save_mean); + auto save_invstd_a = conditional_accessor_1d(save_invstd); + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); // parallel dim reduce on 'channel' at::parallel_for(0, n_channel, 1, [&](int64_t begin, int64_t end) { @@ -1053,14 +1046,12 @@ void batch_norm_cpu_backward_contiguous_internal(Tensor& grad_input, Tensor& gra int64_t d = 0; for (; d < image_size - (image_size % bVec::size()); d += bVec::size()) { bVec dy_bvec = bVec::loadu(dy_ptr + d); - fVec dy_fvec0, dy_fvec1; - std::tie(dy_fvec0, dy_fvec1) = convert_to_float(dy_bvec); + auto [dy_fvec0, dy_fvec1] = convert_to_float(dy_bvec); sum_fvec += dy_fvec0; sum_fvec += dy_fvec1; bVec x_bvec = bVec::loadu(x_ptr + d); - fVec x_fvec0, x_fvec1; - std::tie(x_fvec0, x_fvec1) = convert_to_float(x_bvec); + auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec); dotp_fvec += (x_fvec0 - fVec(mean)) * dy_fvec0; dotp_fvec += (x_fvec1 - fVec(mean)) * dy_fvec1; } @@ -1137,18 +1128,18 @@ void batch_norm_cpu_backward_channels_last_internal(Tensor& grad_input, Tensor& int64_t n_channel = input.size(1); int64_t N = input.numel() / n_channel; - const scalar_t* grad_output_data = grad_output.data_ptr(); - const scalar_t* input_data = input.data_ptr(); + const scalar_t* grad_output_data = grad_output.const_data_ptr(); + const scalar_t* input_data = input.const_data_ptr(); scalar_t* grad_input_data = grad_input.defined() ? grad_input.mutable_data_ptr() : nullptr; param_t* grad_weight_data = grad_weight.defined() ? grad_weight.data_ptr() : nullptr; param_t* grad_bias_data = grad_bias.defined() ? 
grad_bias.data_ptr() : nullptr; - auto weight_a = conditional_accessor_1d(weight); - auto save_mean_a = conditional_accessor_1d(save_mean); - auto save_invstd_a = conditional_accessor_1d(save_invstd); - auto running_mean_a = conditional_accessor_1d(running_mean); - auto running_var_a = conditional_accessor_1d(running_var); + auto weight_a = conditional_accessor_1d(weight); + auto save_mean_a = conditional_accessor_1d(save_mean); + auto save_invstd_a = conditional_accessor_1d(save_invstd); + auto running_mean_a = conditional_accessor_1d(running_mean); + auto running_var_a = conditional_accessor_1d(running_var); // use float as acc type bool weight_defined = weight.defined(); @@ -1188,16 +1179,14 @@ void batch_norm_cpu_backward_channels_last_internal(Tensor& grad_input, Tensor& int64_t d = 0; for(; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { bVec dy_bvec = bVec::loadu(dy_ptr + d); - fVec dy_fvec0, dy_fvec1; - std::tie(dy_fvec0, dy_fvec1) = convert_to_float(dy_bvec); + auto [dy_fvec0, dy_fvec1] = convert_to_float(dy_bvec); fVec sum_fvec0 = dy_fvec0 + fVec::loadu(sum_ptr + d); fVec sum_fvec1 = dy_fvec1 + fVec::loadu(sum_ptr + d + fVec::size()); sum_fvec0.store(sum_ptr + d); sum_fvec1.store(sum_ptr + d + fVec::size()); bVec x_bvec = bVec::loadu(x_ptr + d); - fVec x_fvec0, x_fvec1; - std::tie(x_fvec0, x_fvec1) = convert_to_float(x_bvec); + auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec); fVec mean_fvec0 = fVec::loadu(mean_data + d); fVec mean_fvec1 = fVec::loadu(mean_data + d + fVec::size()); fVec dotp_fvec0 = fVec::loadu(dotp_ptr + d); @@ -1246,8 +1235,7 @@ void batch_norm_cpu_backward_channels_last_internal(Tensor& grad_input, Tensor& int64_t d = 0; for (; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { bVec x_bvec = bVec::loadu(x_ptr + d); - fVec x_fvec0, x_fvec1; - std::tie(x_fvec0, x_fvec1) = convert_to_float(x_bvec); + auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec); fVec mean_fvec0 = fVec::loadu(mean_data + d); fVec mean_fvec1 = fVec::loadu(mean_data + d + fVec::size()); fVec dotp_fvec0 = fVec::loadu(dotp_data + d); @@ -1259,8 +1247,7 @@ void batch_norm_cpu_backward_channels_last_internal(Tensor& grad_input, Tensor& fVec dx_fvec0 = (x_fvec0 - mean_fvec0) * k_fvec0; fVec dx_fvec1 = (x_fvec1 - mean_fvec1) * k_fvec1; bVec dy_bvec = bVec::loadu(dy_ptr + d); - fVec dy_fvec0, dy_fvec1; - std::tie(dy_fvec0, dy_fvec1) = convert_to_float(dy_bvec); + auto [dy_fvec0, dy_fvec1] = convert_to_float(dy_bvec); fVec grad_mean_fvec0 = fVec::loadu(sum_data + d) / fVec(N); fVec grad_mean_fvec1 = fVec::loadu(sum_data + d + fVec::size()) / fVec(N); fVec w_fvec0 = fVec::loadu(weight_data + d); @@ -1287,8 +1274,7 @@ void batch_norm_cpu_backward_channels_last_internal(Tensor& grad_input, Tensor& int64_t d = 0; for (; d < n_channel - (n_channel % bVec::size()); d += bVec::size()) { bVec dy_bvec = bVec::loadu(dy_ptr + d); - fVec dy_fvec0, dy_fvec1; - std::tie(dy_fvec0, dy_fvec1) = convert_to_float(dy_bvec); + auto [dy_fvec0, dy_fvec1] = convert_to_float(dy_bvec); fVec invstd_fvec0 = fVec::loadu(invstd_data + d); fVec invstd_fvec1 = fVec::loadu(invstd_data + d + fVec::size()); fVec w_fvec0 = fVec::loadu(weight_data + d); diff --git a/aten/src/ATen/native/cpu/group_norm_kernel.cpp b/aten/src/ATen/native/cpu/group_norm_kernel.cpp index fc7aad9c28708..f6b7f2a5d4813 100644 --- a/aten/src/ATen/native/cpu/group_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/group_norm_kernel.cpp @@ -43,9 +43,9 @@ void GroupNormKernelImplInternal( TORCH_CHECK(!beta.defined() || beta.numel() == 
C); const int64_t G = group; const int64_t D = C / G; - const T* X_data = X.data_ptr(); - const PT* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; - const PT* beta_data = beta.defined() ? beta.data_ptr() : nullptr; + const T* X_data = X.const_data_ptr(); + const PT* gamma_data = gamma.defined() ? gamma.const_data_ptr() : nullptr; + const PT* beta_data = beta.defined() ? beta.const_data_ptr() : nullptr; T* Y_data = Y.data_ptr(); PT* mean_data = mean.data_ptr(); PT* rstd_data = rstd.data_ptr(); @@ -298,9 +298,9 @@ void GroupNormKernelImplChannelsLastInternal( TORCH_CHECK(!beta.defined() || beta.numel() == C); const int64_t G = group; const int64_t D = C / G; - const T* X_data = X.data_ptr(); - const PT* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; - const PT* beta_data = beta.defined() ? beta.data_ptr() : nullptr; + const T* X_data = X.const_data_ptr(); + const PT* gamma_data = gamma.defined() ? gamma.const_data_ptr() : nullptr; + const PT* beta_data = beta.defined() ? beta.const_data_ptr() : nullptr; T* Y_data = Y.data_ptr(); PT* mean_data = mean.data_ptr(); PT* rstd_data = rstd.data_ptr(); @@ -442,7 +442,7 @@ void GroupNormKernelImplChannelsLastInternal( // // We could fuse step 3 and 4 into a single session but this way is better: // a. D might be too small for vectorization; - // b. Avoid duplicate caculation of scale/bias, each HxW plain share the same scale/bias + // b. Avoid duplicate calculation of scale/bias, each HxW plain share the same scale/bias // for (const auto n : c10::irange(N)) { for (const auto g : c10::irange(G)) { @@ -897,11 +897,11 @@ void GroupNormBackwardKernelImplInternal( TORCH_CHECK(mean.numel() == N * group); TORCH_CHECK(rstd.numel() == N * group); TORCH_CHECK(!gamma.defined() || gamma.numel() == C); - const T* dY_data = dY.data_ptr(); - const T* X_data = X.data_ptr(); - const PT* mean_data = mean.data_ptr(); - const PT* rstd_data = rstd.data_ptr(); - const PT* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; + const T* dY_data = dY.const_data_ptr(); + const T* X_data = X.const_data_ptr(); + const PT* mean_data = mean.const_data_ptr(); + const PT* rstd_data = rstd.const_data_ptr(); + const PT* gamma_data = gamma.defined() ? gamma.const_data_ptr() : nullptr; T* dX_data = dX.defined() ? dX.data_ptr() : nullptr; PT* dgamma_data = dgamma.defined() ? dgamma.data_ptr() : nullptr; PT* dbeta_data = dbeta.defined() ? dbeta.data_ptr() : nullptr; @@ -1377,11 +1377,11 @@ void GroupNormBackwardKernelImplChannelsLastInternal( TORCH_CHECK(!gamma.defined() || gamma.numel() == C); int64_t D = C / group; int64_t G = group; - const T* dY_data = dY.data_ptr(); - const T* X_data = X.data_ptr(); - const PT* mean_data = mean.data_ptr(); - const PT* rstd_data = rstd.data_ptr(); - const PT* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; + const T* dY_data = dY.const_data_ptr(); + const T* X_data = X.const_data_ptr(); + const PT* mean_data = mean.const_data_ptr(); + const PT* rstd_data = rstd.const_data_ptr(); + const PT* gamma_data = gamma.defined() ? gamma.const_data_ptr() : nullptr; T* dX_data = dX.defined() ? dX.data_ptr() : nullptr; PT* dgamma_data = dgamma.defined() ? dgamma.data_ptr() : nullptr; PT* dbeta_data = dbeta.defined() ? 
dbeta.data_ptr() : nullptr; diff --git a/aten/src/ATen/native/cpu/int4mm_kernel.cpp b/aten/src/ATen/native/cpu/int4mm_kernel.cpp new file mode 100644 index 0000000000000..acb4b927f23f5 --- /dev/null +++ b/aten/src/ATen/native/cpu/int4mm_kernel.cpp @@ -0,0 +1,691 @@ +#include +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#if (defined(_WIN32) || defined(_WIN64)) +#define RESTRICT __restrict +#else +#define RESTRICT __restrict__ +#endif + +namespace at::native { + +namespace { + +inline bool is_block_start(int index, int BLOCK_SIZE) { + return !(index & (BLOCK_SIZE -1)); +} + +#if (defined(CPU_CAPABILITY_AVX512) || defined(CPU_CAPABILITY_AVX2)) && !defined(_MSC_VER) +// convert 16x int4 to int8, handle 64 bits at a time +// used in avx2 and avx512 +inline __m128i conver_int4_to_int8(const uint8_t* data) { + __m128i tmp = _mm_loadu_si64((const __m128i*)data); + __m128i bytes = _mm_cvtepu8_epi16(tmp); + const __m128i lowMask = _mm_set1_epi8(0xF); + __m128i high = _mm_andnot_si128(lowMask, bytes); + __m128i low = _mm_and_si128(lowMask, bytes); + high = _mm_slli_epi16(high, 4); + bytes = _mm_or_si128(low, high); + return bytes; +} +#endif + +#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) + +// A block : {BLOCK_M, BLOCK_K}, lda = K +// B block : {BLOCK_K, BLOCK_N / 2}, ldb = BLOCK_N / 2 +// C block : {BLOCK_M, BLOCK_N}, ldc = N +// +// ScaleAndZeros block : {1, BLOCK_N, 2} +// +template +inline void tinygemm_kernel( + const BFloat16* RESTRICT A, + const uint8_t* RESTRICT B, + const BFloat16* RESTRICT ScaleAndZeros, + BFloat16* RESTRICT C, + int lda, + int ldb, + int ldc, + int K, + int BLOCK_K) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N / 16; + + const int PREFETCH_SIZE_K = 16 * 4; + const int PREFETCH_SIZE_KB = (PREFETCH_SIZE_K + BLOCK_K - 1) / BLOCK_K; + + // number of blocks on K + const int KB = K / BLOCK_K; + + __m512 va; + __m512 vb[COLS]; + __m512 vc[ROWS * COLS]; + __m512 scale[COLS]; + __m512 zero[COLS]; + + // Lookup table to de-quantize int4 values to bf16. 
+ // Values are dequantized as truly int4 [-8, 7] range; + // + // dequant = (bf16(int4_value) * bf16_scale) + bf16_zero + // + static const __m512 lut = _mm512_set_ps( + 7.0f, 6.0f, 5.0f, 4.0f, + 3.0f, 2.0f, 1.0f, 0.0f, + -1.0f, -2.0f, -3.0f, -4.0f, + -5.0f, -6.0f, -7.0f, -8.0f); + + // index for transpose + static const __m512i idx1 = _mm512_set_epi32( + 30, 28, 26, 24, 22, 20, 18, 16, + 14, 12, 10, 8, 6, 4, 2, 0); + static const __m512i idx2 = _mm512_set_epi32( + 31, 29, 27, 25, 23, 21, 19, 17, + 15, 13, 11, 9, 7, 5, 3, 1); + + // load scale and zero point + auto load_scale_and_zeros = [&](int i, int _kb) { + // load 2x bfloat16 vector + __m512i t = _mm512_loadu_si512((__m512i*)(ScaleAndZeros + _kb * ldc * 2 + 32 * i)); + if (_kb + PREFETCH_SIZE_KB < KB) { + _mm_prefetch(ScaleAndZeros + (_kb + PREFETCH_SIZE_KB) * ldc * 2 + 32 * i, _MM_HINT_T0); + } + + // convert to 2x f32 vector + __m512 a, b; + vec::cvtbf16_fp32(t, a, b); + + // transpose scale_and_zero from {16, 2} to {2, 16} + // inputs: + // a: {s0, z0, s1, z1, ..., s7, z7} + // b: {s8, z8, s9, z9, ..., s15, z15} + // output: + // scale: {s0, s1, s2, ..., s15} + // zero: {z0, z1, z2, ..., z15} + scale[i] = _mm512_mask_permutex2var_ps(a, 0xffff, idx1, b); + zero[i] = _mm512_mask_permutex2var_ps(a, 0xffff, idx2, b); + }; + + auto loadc = [&](auto i) { + vc[i] = _mm512_setzero_ps(); + }; + c10::ForcedUnroll{}(loadc); + + auto compute = [&, COLS](auto i, int k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + float aa = static_cast(A[row * lda + k]); + if (k + PREFETCH_SIZE_K < K) { + _mm_prefetch(A + row * lda + k + PREFETCH_SIZE_K, _MM_HINT_T0); + } + va = _mm512_set1_ps(aa); + } + + if constexpr (row == 0) { + if constexpr (COLS == 4) { + // when BLOCK_N = 64, handle each row at a time + // to reduce de-quantize overhead. + if constexpr (col == 0) { + __m256i b4 = _mm256_loadu_si256((__m256i*)(B + k * ldb)); + if (k + PREFETCH_SIZE_K < K) { + _mm_prefetch(B + (k + PREFETCH_SIZE_K) * ldb, _MM_HINT_T0); + } + + __m512i b32 = _mm512_cvtepu8_epi32(_mm256_castsi256_si128(b4)); + vb[0] = _mm512_permutexvar_ps(b32, lut); + vb[0] = _mm512_fmadd_ps(vb[0], scale[0], zero[0]); + vb[2] = _mm512_permutexvar_ps(_mm512_srli_epi32(b32, 4), lut); + vb[2] = _mm512_fmadd_ps(vb[2], scale[2], zero[2]); + + b32 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(b4, 1)); + vb[1] = _mm512_permutexvar_ps(b32, lut); + vb[1] = _mm512_fmadd_ps(vb[1], scale[1], zero[1]); + vb[3] = _mm512_permutexvar_ps(_mm512_srli_epi32(b32, 4), lut); + vb[3] = _mm512_fmadd_ps(vb[3], scale[3], zero[3]); + } + } else { + __m128i b8 = conver_int4_to_int8(B + k * ldb + col * 8); + __m512i b32 = _mm512_cvtepu8_epi32(b8); + vb[col] = _mm512_permutexvar_ps(b32, lut); + vb[col] = _mm512_fmadd_ps(vb[col], scale[col], zero[col]); + } + } + + constexpr int idx = row * COLS + col; + vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]); + }; + + for (int k = 0, kb = 0; k < K; ++k) { + if (is_block_start(k, BLOCK_K)) { + c10::ForcedUnroll{}(load_scale_and_zeros, kb++); + } + c10::ForcedUnroll{}(compute, k); + } + + //store to C + auto storec = [&, COLS](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + if constexpr (COLS == 4) { + // when BLOCK_N = 64, handle each row at a time + // to reduce `cvtfp32_bf16` overhead. 
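Before the store step below, a scalar reference for the lookup-table dequantization described above may help: each packed byte carries two 4-bit values, the nibble (0..15) is mapped to the signed range [-8, 7] exactly as this kernel's LUT does (nibble minus 8), and the per-group scale and zero are then applied. This standalone sketch uses plain float instead of bf16 and is not part of the patch.

#include <cstdint>

// dequant = float(int4_value) * scale + zero, where int4_value = nibble - 8.
float dequant_int4(uint8_t packed_byte, bool high_nibble, float scale, float zero) {
  int nibble = high_nibble ? (packed_byte >> 4) : (packed_byte & 0x0F);
  return static_cast<float>(nibble - 8) * scale + zero;
}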
+ if constexpr (col == 0) { + __m512i c01 = vec::cvtfp32_bf16(vc[row * 4 + 0], vc[row * 4 + 1]); + __m512i c23 = vec::cvtfp32_bf16(vc[row * 4 + 2], vc[row * 4 + 3]); + _mm512_storeu_si512((__m512i*)(C + row * ldc + 0 * 32), c01); + _mm512_storeu_si512((__m512i*)(C + row * ldc + 1 * 32), c23); + } + } else { + __m256i ci = vec::cvtfp32_bf16(vc[i]); + _mm256_storeu_si256((__m256i*)(C + row * ldc + col * 16), ci); + } + }; + c10::ForcedUnroll{}(storec); +} + +#elif defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) + +template +inline void tinygemm_kernel( + const BFloat16* RESTRICT A, + const uint8_t* RESTRICT B, + const BFloat16* RESTRICT ScaleAndZeros, + BFloat16* RESTRICT C, + int lda, + int ldb, + int ldc, + int K, + int BLOCK_K) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N / 8; + + const int PREFETCH_SIZE_K = 16 * 4; + const int PREFETCH_SIZE_KB = (PREFETCH_SIZE_K + BLOCK_K - 1) / BLOCK_K; + + // number of blocks on K + const int KB = K / BLOCK_K; + + __m256 va; + __m256 vb[COLS]; + __m256 vc[ROWS * COLS]; + __m256 scale[COLS]; + __m256 zero[COLS]; + + static const __m256i idx1 = _mm256_setr_epi32(0, 2, 4, 6, 1, 3, 5, 7); + + // offset to shift from range [0, 15] to [-8, 7] + const __m256 offset = _mm256_set1_ps(-8.0f); + + // load scale and zero point + auto load_scale_and_zeros = [&](int i, int _kb) { + // load 2x bfloat16 vector + __m256i t = _mm256_loadu_si256((__m256i*)(ScaleAndZeros + _kb * ldc * 2 + 16 * i)); + if (_kb + PREFETCH_SIZE_KB < KB) { + _mm_prefetch(ScaleAndZeros + (_kb + PREFETCH_SIZE_KB) * ldc * 2 + 16 * i, _MM_HINT_T0); + } + + // convert to 2x f32 vector + __m256 a, b; + vec::cvtbf16_fp32(t, a, b); + + // transpose scale_and_zero from {8, 2} to {2, 8} + // inputs: + // a: {s0, z0, s1, z1, s2, z2, s3, z3} + // b: {s4, z4, s5, z5, s6, z6, s7, z7} + // output: + // scale: {s0, s1, s2, s3, s4, s5, s6, s7} + // zero: {z0, z1, z2, z3, z4, z5, z6, z7} + a = _mm256_permutevar8x32_ps(a, idx1); + b = _mm256_permutevar8x32_ps(b, idx1); + scale[i] = _mm256_permute2f128_ps(a, b, 0b0100000); + zero[i] = _mm256_permute2f128_ps(a, b, 0b0110001); + + // zero = -8 * scale + zero + zero[i] = _mm256_fmadd_ps(scale[i], offset, zero[i]); + }; + + auto loadc = [&](auto i) { + vc[i] = _mm256_setzero_ps(); + }; + c10::ForcedUnroll{}(loadc); + + auto compute = [&, COLS](auto i, int k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + float aa = static_cast(A[row * lda + k]); + if (k + PREFETCH_SIZE_K < K) { + _mm_prefetch(A + row * lda + k + PREFETCH_SIZE_K, _MM_HINT_T0); + } + va = _mm256_set1_ps(aa); + } + + if constexpr (row == 0) { + if constexpr (COLS == 4) { + // when BLOCK_N = 32, handle each row at a time + if constexpr (col == 0) { + __m256i mask = _mm256_set1_epi32(0xF); + __m128i b4 = _mm_loadu_si128((__m128i*)(B + k * ldb)); + if (k + PREFETCH_SIZE_K < K) { + _mm_prefetch(B + (k + PREFETCH_SIZE_K) * ldb, _MM_HINT_T0); + } + + __m256i b32 = _mm256_cvtepu8_epi32(b4); + vb[0] = _mm256_cvtepi32_ps(_mm256_and_si256(b32, mask)); + vb[0] = _mm256_fmadd_ps(vb[0], scale[0], zero[0]); + vb[2] = _mm256_cvtepi32_ps(_mm256_srli_epi32(b32, 4)); + vb[2] = _mm256_fmadd_ps(vb[2], scale[2], zero[2]); + + b32 = _mm256_cvtepu8_epi32(_mm_shuffle_epi32(b4, _MM_SHUFFLE(3, 2, 3, 2))); + vb[1] = _mm256_cvtepi32_ps(_mm256_and_si256(b32, mask)); + vb[1] = _mm256_fmadd_ps(vb[1], scale[1], zero[1]); + vb[3] = _mm256_cvtepi32_ps(_mm256_srli_epi32(b32, 4)); + vb[3] = _mm256_fmadd_ps(vb[3], scale[3], zero[3]); + } + } else { + if constexpr 
(col % 2 == 0) { + // de-quantize per 64 bits (16x int4) + __m128i b8 = conver_int4_to_int8(B + k * ldb + col * 4); + __m128i b8_val0 = _mm_set1_epi64x(_mm_extract_epi64(b8, 0)); + __m128i b8_val1 = _mm_set1_epi64x(_mm_extract_epi64(b8, 1)); + if (k + PREFETCH_SIZE_K < K) { + _mm_prefetch(B + (k + PREFETCH_SIZE_K) * ldb + col * 4, _MM_HINT_T0); + } + + vb[col] = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(b8_val0)); + vb[col] = _mm256_fmadd_ps(vb[col], scale[col], zero[col]); + vb[col + 1] = _mm256_cvtepi32_ps(_mm256_cvtepu8_epi32(b8_val1)); + vb[col + 1] = _mm256_fmadd_ps(vb[col + 1], scale[col + 1], zero[col + 1]); + } + } + } + + constexpr int idx = row * COLS + col; + vc[idx] = _mm256_fmadd_ps(va, vb[col], vc[idx]); + }; + for (int k = 0, kb = 0; k < K; ++k) { + if (is_block_start(k, BLOCK_K)) { + c10::ForcedUnroll{}(load_scale_and_zeros, kb++); + } + c10::ForcedUnroll{}(compute, k); + } + + // store to C + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + if constexpr (col % 2 == 0) { + __m256i ci = vec::cvtfp32_bf16(vc[row * COLS + col], vc[row * COLS + col + 1]); + _mm256_storeu_si256((__m256i*)(C + row * ldc + col * 8), ci); + } + }; + c10::ForcedUnroll{}(storec); +} + +#endif + +#if !defined(C10_MOBILE) && defined(__aarch64__) +#include +template +inline void tinygemm_kernel( + const Half* RESTRICT A, + const uint8_t* RESTRICT B, + const Half* RESTRICT ScaleAndZeros, + Half* RESTRICT C, + int lda, + int ldb, + int ldc, + int K, + int BLOCK_K) { + int16_t shift_vals[4] = {0, -4, -8, -12}; + int16x4_t shifts = vld1_s16(shift_vals); + int16x4_t offs = vdup_n_s16(8); + uint16x4_t mask = vdup_n_u16(0x0F); + for (const auto m : c10::irange(BLOCK_M)) { + for (int n = 0; n < BLOCK_N; n+= 16) { + float32x4_t c_val[4]; + float32x4_t scales[4], zeros[4]; + c10::ForcedUnroll<4>{}([&](auto i) { + c_val[i] = vdupq_n_f32(0.0); + }); + for (const auto k : c10::irange(K)) { + const auto a_val = vdupq_n_f32(static_cast(A[m * lda + k])); + if (is_block_start(k, BLOCK_K)) { + int kb = k / BLOCK_K; + c10::ForcedUnroll<4>{}([&](auto i) { + auto scales_and_zeros = vld2_f16(reinterpret_cast(ScaleAndZeros + kb * ldc * 2 + n * 2 + i * 8)); + scales[i] = vcvt_f32_f16(scales_and_zeros.val[0]); + zeros[i] = vcvt_f32_f16(scales_and_zeros.val[1]); + }); + } + c10::ForcedUnroll<4>{}([&](auto i) { + uint16_t b_pack = reinterpret_cast(B + k * ldb + n / 2)[i]; + uint16x4_t b_masked = vand_u16(vshl_u16(vdup_n_u16(b_pack), shifts), mask); + int16x4_t b_ints = vsub_s16(vreinterpret_s16_u16(b_masked), offs); + float32x4_t b_vals = vcvtq_f32_s32(vmovl_s16(b_ints)); + b_vals = vaddq_f32(zeros[i], vmulq_f32(scales[i], b_vals)); + c_val[i] = vfmaq_f32(c_val[i], b_vals, a_val); + }); + } + c10::ForcedUnroll<4>{}([&](auto i) { + vst1_f16(reinterpret_cast(C + m * ldc + n + i * 4), vcvt_f16_f32(c_val[i])); + }); + } + } +} +#endif + +template +inline float convert_int4_to_float(const uint8_t* b, int n) { + static constexpr float lut[16] = { + -8.0f, -7.0f, -6.0f, -5.0f, + -4.0f, -3.0f, -2.0f, -1.0f, + 0.0f, 1.0f, 2.0f, 3.0f, + 4.0f, 5.0f, 6.0f, 7.0f + }; + int index; +#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) + if constexpr (BLOCK_N == 64) { + const int nb = n/BLOCK_N; + n -= nb*BLOCK_N; + if (n < 32) { + auto val = b[nb * BLOCK_N / 2 + n]; + index = val & 0x0f; + } else { + auto val = b[nb * BLOCK_N / 2 + (n - 32)]; + index = val >> 4; + } + } else +#elif defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) + if constexpr (BLOCK_N == 32) { + const int nb = n/BLOCK_N; + n -= 
nb*BLOCK_N; + if (n < 16) { + auto val = b[nb * BLOCK_N / 2 + n]; + index = val & 0x0f; + } else { + auto val = b[nb * BLOCK_N / 2 + (n - 16)]; + index = val >> 4; + } + } else +#endif + { + const auto is_even = (n & 1) == 0; + auto val = b[n/2]; + index = is_even ? (val & 0x0F) : (val >> 4); + } + return lut[index]; +} + +// non-vectorized version +template +inline void tinygemm_kernel( + const T* RESTRICT A, + const uint8_t* RESTRICT B, + const T* RESTRICT ScaleAndZeros, + T* RESTRICT C, + int lda, + int ldb, + int ldc, + int K, + int BLOCK_K) { + + for (const auto m : c10::irange(BLOCK_M)) { + for (const auto n : c10::irange(BLOCK_N)) { + float c_val = 0; + for (const auto k : c10::irange(K)) { + int kb = k / BLOCK_K; + const auto scale = static_cast(ScaleAndZeros[kb * ldc * 2 + n * 2]); + const auto zero = static_cast(ScaleAndZeros[kb * ldc * 2 + n * 2 + 1]); + const auto a_val = static_cast(A[m * lda + k]); + float b_val = convert_int4_to_float(B + k *ldb, n); + b_val = b_val * scale + zero; + + c_val += a_val * b_val; + } + C[m * ldc + n] = c_val; + } + } +} + + +#define LAUNCH_TINYGEMM_KERNEL(MB_SIZE, NB_SIZE) \ + tinygemm_kernel( \ + A_ptr, B_ptr, S_ptr, C_ptr, \ + K, NB_SIZE / 2, N, K, BLOCK_K); + +#define LAUNCH_TINYGEMM_NB_SIZE(MB_SIZE) \ + switch (nb_size) { \ + case 16: \ + LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 16); \ + break; \ + case 32: \ + LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 32); \ + break; \ + case 48: \ + LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 48); \ + break; \ + case 64: \ + LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 64); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported n block size: ", nb_size); \ + break; \ + } + +// NB: int4 weight pack (with BLOCK_N 64) +// weight (int32): {N/64, 64, K} +// packed (uint8): {N/64, K, 32} +// +// 1. avx512 packed format: +// When N is 64, to do 256-bit unpacking at a time, we pack Lane0 with Lane2, +// Lane1 with Lane3 since we can only do shift on a 128-bit basis. +// +// weight: +// [Lane0] N0...15: {a00, a01, a02, ...} +// [Lane1] N16...31: {a10, a11, a12, ...} +// [Lane2] N32...47: {a20, a21, a22, ...} +// [Lane3] N48...63: {a30, a31, a32, ...} +// +// packed: +// [Lane02] N0...31: {a20|a00, a21|a01, a22|a02, ...} +// [Lane13] N32...63: {a30|a10, a31|a11, a32|a12, ...} +// +// Note: when N is 16, 32 or 48, pack with 64-bit format. +// +// 2. avx2 packed format: +// When N is 32, to do 128-bit unpacking at a time. +// +// weight: +// [Lane0] N0...15: { a0, a1, a2, ...} +// [Lane1] N16...32: {a16, a17, a18, ...} +// +// packed: +// [Lane01] N0...32: {a16|a0, a17|a1, a18|a2, ...} +// +// Note: When N is 16, pack with 64-bit format +// +// 3 non-vectorized packed format: +// Do 64-bit unpacking at a time. 
+// +// weight: {a0, a1, a2, a3, ..., a14, a15} +// packed: {a1|a0, a3, a2, ..., a15|a14} +// +void weight_to_int4pack_kernel( + const Tensor& weight_packed, + const Tensor& weight, + int N, int K) { + + auto weight_packed_data = reinterpret_cast(weight_packed.data_ptr()); + const auto weight_data = weight.data_ptr(); + + // 64 for avx512 and 32 for avx2/non-vectorized + constexpr int BLOCK_N = vec::Vectorized::size() * 4; + const int NB = (N + BLOCK_N - 1) / BLOCK_N; + + // parallel on NB blocks + at::parallel_for(0, NB, 0, [&](int begin, int end) { + for (const auto i : c10::irange(begin, end)) { + int nb_size = std::min(BLOCK_N, N - i * BLOCK_N); + + const int32_t* src = weight_data + i * BLOCK_N * K; + uint8_t* dst = weight_packed_data + i * K * BLOCK_N / 2; + for (const auto k : c10::irange(K)) { +#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) + if (nb_size == BLOCK_N) { + for (const auto d : c10::irange(16)) { + int32_t val0 = src[(d + 0) * K + k]; + int32_t val1 = src[(d + 16) * K + k]; + int32_t val2 = src[(d + 32) * K + k]; + int32_t val3 = src[(d + 48) * K + k]; + + uint8_t packed02 = (((uint8_t)(val2) << 4)) | ((uint8_t)(val0)); + uint8_t packed13 = (((uint8_t)(val3) << 4)) | ((uint8_t)(val1)); + + dst[k * 32 + d] = packed02; + dst[k * 32 + 16 + d] = packed13; + } + } else { + // for nb_size 16, 32, 48 + for (int n = 0; n < nb_size; n += 2) { + int32_t val0 = src[n * K + k]; + int32_t val1 = src[n * K + K + k]; + + uint8_t packed = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0)); + dst[k * nb_size / 2 + n / 2] = packed; + } + } +#elif defined(CPU_CAPABILITY_AVX2) && !defined(_MSC_VER) + if (nb_size == BLOCK_N) { + // for nb_size 32 + for (const auto d : c10::irange(16)) { + int32_t val0 = src[(d + 0) * K + k]; + int32_t val1 = src[(d + 16) * K + k]; + + uint8_t packed01 = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0)); + dst[k * 16 + d] = packed01; + } + } else { + // for nb_size 16 + for (int n = 0; n < nb_size; n += 2) { + int32_t val0 = src[n * K + k]; + int32_t val1 = src[n * K + K + k]; + + uint8_t packed = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0)); + dst[k * nb_size / 2 + n / 2] = packed; + } + } +#else + for (int n = 0; n < nb_size; n += 2) { + int32_t val0 = src[n * K + k]; + int32_t val1 = src[n * K + K + k]; + + uint8_t packed = (((uint8_t)(val1) << 4)) | ((uint8_t)(val0)); + dst[k * nb_size / 2 + n / 2] = packed; + } +#endif + } + } + }); +} + +template +void int4pack_mm_kernel_( + const Tensor& C, + const Tensor& A, + const Tensor& B, + int qGroupSize, + const Tensor& qScaleAndZeros, + int N, int K) { + + const auto* A_data = A.const_data_ptr(); + const auto* B_data = reinterpret_cast(B.const_data_ptr()); + auto* C_data = C.data_ptr(); + const auto* S_data = qScaleAndZeros.const_data_ptr(); + + int M = A.size(0); + + constexpr int BLOCK_M = 4; + // 64 for avx512 and 32 for avx2/non-vectorized + constexpr int BLOCK_N = vec::Vectorized::size() * 4; + // 32, 64, 128, 256 + const int BLOCK_K = qGroupSize; + + const int MB = (M + BLOCK_M - 1) / BLOCK_M; + const int NB = (N + BLOCK_N - 1) / BLOCK_N; + + at::parallel_for(0, MB * NB, 0, [&](int begin, int end) { + int mb{0}, nb{0}; + data_index_init(begin, mb, MB, nb, NB); + + for (C10_UNUSED const auto i : c10::irange(begin, end)) { + int mb_start = mb * BLOCK_M; + int mb_size = std::min(BLOCK_M, M - mb_start); + int nb_start = nb * BLOCK_N; + int nb_size = std::min(BLOCK_N, N - nb_start); + + const auto* A_ptr = A_data + mb_start * K; + const auto* B_ptr = B_data + nb_start * K / 2; + const auto* S_ptr = S_data + 
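The scalar fallback branch of weight_to_int4pack_kernel above defines the simplest of the three layouts described in the preceding comment: two adjacent n indices at the same k share one byte, low nibble first. A standalone sketch (not part of the patch) that packs an {N, K} matrix of 4-bit codes into the resulting {K, N/2} byte buffer and checks the round trip against the even/odd-nibble unpacking rule used by convert_int4_to_float; names are illustrative.

// Pairwise int4 nibble packing / unpacking round trip (illustrative only).
#include <cassert>
#include <cstdint>
#include <vector>

std::vector<uint8_t> pack_int4_pairs(const std::vector<int32_t>& w, int N, int K) {
  std::vector<uint8_t> packed(K * N / 2);
  for (int k = 0; k < K; ++k) {
    for (int n = 0; n < N; n += 2) {
      const uint8_t lo = static_cast<uint8_t>(w[n * K + k]) & 0x0F;        // code for n
      const uint8_t hi = static_cast<uint8_t>(w[(n + 1) * K + k]) & 0x0F;  // code for n + 1
      packed[k * N / 2 + n / 2] = static_cast<uint8_t>(hi << 4) | lo;      // hi|lo, as above
    }
  }
  return packed;
}

int unpack_int4(const std::vector<uint8_t>& packed, int N, int k, int n) {
  const uint8_t byte = packed[k * N / 2 + n / 2];
  return (n % 2 == 0) ? (byte & 0x0F) : (byte >> 4);  // even n -> low nibble, odd n -> high
}

int main() {
  const int N = 4, K = 3;
  std::vector<int32_t> w(N * K);
  for (int n = 0; n < N; ++n)
    for (int k = 0; k < K; ++k)
      w[n * K + k] = (n + 2 * k) % 16;  // arbitrary 4-bit codes
  auto packed = pack_int4_pairs(w, N, K);
  for (int n = 0; n < N; ++n)
    for (int k = 0; k < K; ++k)
      assert(unpack_int4(packed, N, k, n) == w[n * K + k]);  // round trip
  return 0;
}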
nb_start * 2; + auto* C_ptr = C_data + mb_start * N + nb_start; + + switch (mb_size) { + case 1: + LAUNCH_TINYGEMM_NB_SIZE(1); + break; + case 2: + LAUNCH_TINYGEMM_NB_SIZE(2); + break; + case 3: + LAUNCH_TINYGEMM_NB_SIZE(3); + break; + case 4: + LAUNCH_TINYGEMM_NB_SIZE(4); + break; + default: + TORCH_CHECK(false, "Unsupported m block size: ", mb_size); + } + + // move to the next index + data_index_step(mb, MB, nb, NB); + } + }); +} + +void int4pack_mm_kernel( + const Tensor& C, + const Tensor& A, + const Tensor& B, + int qGroupSize, + const Tensor& qScaleAndZeros, + int N, int K) { + if (C.scalar_type() == kBFloat16) { + int4pack_mm_kernel_(C, A, B, qGroupSize, qScaleAndZeros, N, K); + } else if (C.scalar_type() == kHalf) { + int4pack_mm_kernel_(C, A, B, qGroupSize, qScaleAndZeros, N, K); + } else { + int4pack_mm_kernel_(C, A, B, qGroupSize, qScaleAndZeros, N, K); + } +} + +} // anonymous namespace + +ALSO_REGISTER_AVX512_DISPATCH(weight_to_int4pack_stub, &weight_to_int4pack_kernel); +ALSO_REGISTER_AVX512_DISPATCH(int4pack_mm_stub, &int4pack_mm_kernel); + +} // at::native diff --git a/aten/src/ATen/native/cpu/int8mm_kernel.cpp b/aten/src/ATen/native/cpu/int8mm_kernel.cpp new file mode 100644 index 0000000000000..bd266030b2566 --- /dev/null +++ b/aten/src/ATen/native/cpu/int8mm_kernel.cpp @@ -0,0 +1,430 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#if (defined(_WIN32) || defined(_WIN64)) +#define RESTRICT __restrict +#else +#define RESTRICT __restrict__ +#endif + +namespace at::native { + +namespace { + +#if defined(CPU_CAPABILITY_AVX512) && !defined(_MSC_VER) + +// A block : {BLOCK_M, BLOCK_K}, lda = K +// B block : {BLOCK_K, BLOCK_N}, ldb = K +// C block : {BLOCK_M, BLOCK_N}, ldc = N +// +// scales block: {BLOCK_N} +// +template +inline void tinygemm_kernel( + const BFloat16* RESTRICT A, + const int8_t* RESTRICT B, + const BFloat16* RESTRICT scales, + BFloat16* RESTRICT C, + int lda, + int ldb, + int ldc, + int K) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N; + + const int PREFETCH_SIZE_K = 16 * 4; + + __m512 va; + __m512 vb[COLS]; + __m512 vc[ROWS * COLS]; + __m512 scale[COLS]; + + auto load_scale = [&](int i) { + float ss = static_cast(scales[i]); + scale[i] = _mm512_set1_ps(ss); + }; + c10::ForcedUnroll{}(load_scale); + + auto loadc = [&](auto i) { + vc[i] = _mm512_setzero_ps(); + }; + c10::ForcedUnroll{}(loadc); + + auto compute = [&](auto i, int k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + __m256i a16 = _mm256_load_si256((__m256i*)(A + row * lda + k)); + if (k + PREFETCH_SIZE_K < K) { + _mm_prefetch(A + row * lda + k + PREFETCH_SIZE_K, _MM_HINT_T0); + } + vec::cvtbf16_fp32(a16, va); + } + + if constexpr (row == 0) { + __m128i b8 = _mm_load_si128((__m128i*)(B + col * ldb + k)); + if (k + PREFETCH_SIZE_K < K) { + _mm_prefetch(B + col * ldb + k + PREFETCH_SIZE_K, _MM_HINT_T0); + } + __m512i b32 = _mm512_cvtepi8_epi32(b8); + vb[col] = _mm512_cvtepi32_ps(b32); + vb[col] = _mm512_mul_ps(vb[col], scale[col]); + } + + constexpr int idx = row * COLS + col; + vc[idx] = _mm512_fmadd_ps(va, vb[col], vc[idx]); + }; + + for (int k = 0; k < K; k += 16) { + c10::ForcedUnroll{}(compute, k); + } + + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + C[row * ldc + col] = static_cast(_mm512_reduce_add_ps(vc[i])); + }; + c10::ForcedUnroll{}(storec); +} + +#elif defined(CPU_CAPABILITY_AVX2) 
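int4pack_mm_kernel_ above walks a flat parallel_for range over MB * NB tiles, decodes each flat index into a (mb, nb) pair, and shrinks the trailing tiles with std::min. A small sketch (not part of the patch) of that decomposition with nb varying fastest; the inline index bookkeeping is an illustrative stand-in for data_index_init / data_index_step.

// Flat range over MB x NB tiles -> (mb, nb) pairs plus tail-aware tile sizes.
#include <algorithm>
#include <cstdio>

int main() {
  const int M = 10, N = 70;                    // problem size
  const int BLOCK_M = 4, BLOCK_N = 32;         // tile size
  const int MB = (M + BLOCK_M - 1) / BLOCK_M;  // 3 row tiles
  const int NB = (N + BLOCK_N - 1) / BLOCK_N;  // 3 column tiles

  const int begin = 0, end = MB * NB;
  int mb = begin / NB;  // what data_index_init(begin, mb, MB, nb, NB) would compute
  int nb = begin % NB;
  for (int i = begin; i < end; ++i) {
    const int mb_start = mb * BLOCK_M;
    const int mb_size = std::min(BLOCK_M, M - mb_start);  // tail tile may be smaller
    const int nb_start = nb * BLOCK_N;
    const int nb_size = std::min(BLOCK_N, N - nb_start);
    std::printf("tile %d: rows [%d, %d), cols [%d, %d)\n",
                i, mb_start, mb_start + mb_size, nb_start, nb_start + nb_size);
    if (++nb == NB) { nb = 0; ++mb; }  // what data_index_step(mb, MB, nb, NB) does
  }
  return 0;
}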
&& !defined(_MSC_VER) + +static inline float _mm256_reduce_add_ps(__m256& v) { + __m256 v1 = _mm256_permute2f128_ps(v, v, 0x1); + v = _mm256_add_ps(v, v1); + v1 = _mm256_shuffle_ps(v, v, 0x4E); + v = _mm256_add_ps(v, v1); + v1 = _mm256_shuffle_ps(v, v, 0xB1); + v = _mm256_add_ps(v, v1); + return _mm256_cvtss_f32(v); +} + +template +inline void tinygemm_kernel( + const BFloat16* RESTRICT A, + const int8_t* RESTRICT B, + const BFloat16* RESTRICT scales, + BFloat16* RESTRICT C, + int lda, + int ldb, + int ldc, + int K) { + + constexpr int ROWS = BLOCK_M; + constexpr int COLS = BLOCK_N; + + const int PREFETCH_SIZE_K = 16 * 4; + + __m256 va; + __m256 vb[COLS]; + __m256 vc[ROWS * COLS]; + __m256 scale[COLS]; + + auto load_scale = [&](int i) { + float ss = static_cast(scales[i]); + scale[i] = _mm256_set1_ps(ss); + }; + c10::ForcedUnroll{}(load_scale); + + auto loadc = [&](auto i) { + vc[i] = _mm256_setzero_ps(); + }; + c10::ForcedUnroll{}(loadc); + + auto compute = [&](auto i, int k) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + + if constexpr (col == 0) { + __m128i a16 = _mm_load_si128((__m128i*)(A + row * lda + k)); + if (k + PREFETCH_SIZE_K < K) { + _mm_prefetch(A + row * lda + k + PREFETCH_SIZE_K, _MM_HINT_T0); + } + vec::cvtbf16_fp32(a16, va); + } + + if constexpr (row == 0) { + __m128i b8 = _mm_loadu_si64((__m128i*)(B + col * ldb + k)); + if (k + PREFETCH_SIZE_K < K) { + _mm_prefetch(B + col * ldb + k + PREFETCH_SIZE_K, _MM_HINT_T0); + } + __m256i b32 = _mm256_cvtepi8_epi32(b8); + vb[col] = _mm256_cvtepi32_ps(b32); + vb[col] = _mm256_mul_ps(vb[col], scale[col]); + } + + constexpr int idx = row * COLS + col; + vc[idx] = _mm256_fmadd_ps(va, vb[col], vc[idx]); + }; + + for (int k = 0; k < K; k += 8) { + c10::ForcedUnroll{}(compute, k); + } + + auto storec = [&](auto i) { + constexpr int row = i / COLS; + constexpr int col = i % COLS; + C[row * ldc + col] = static_cast(_mm256_reduce_add_ps(vc[i])); + }; + c10::ForcedUnroll{}(storec); +} + +#endif + +#if !defined(C10_MOBILE) && defined(__aarch64__) +#include + +inline float reduce(float32x4_t x) { + auto sum = vpaddq_f32(x, x); + return vgetq_lane_f32(vpaddq_f32(sum, sum), 0); +} + +inline float32x4x2_t load_as_float32x4x2(const Half* ptr) { + float16x8_t f16_val = vld1q_f16(reinterpret_cast(ptr)); + auto val_low = vcvt_f32_f16(vget_low_f16(f16_val)); + auto val_high = vcvt_f32_f16(vget_high_f16(f16_val)); + return {val_low, val_high}; +} + +inline float32x4_t load_as_float32x4(const Half* ptr) { + return vcvt_f32_f16(vld1_f16(reinterpret_cast(ptr))); +} + +inline float32x4x2_t load_as_float32x4x2(const BFloat16* ptr) { + int32x4_t shift = vdupq_n_s32(16); + uint16x8_t u16_val = vld1q_u16(reinterpret_cast(ptr)); + uint32x4_t int_low = vmovl_u16(vget_low_u16(u16_val)); + uint32x4_t int_high = vmovl_u16(vget_high_u16(u16_val)); + return {vreinterpretq_f32_u32(vshlq_u32(int_low, shift)), vreinterpretq_f32_u32(vshlq_u32(int_high, shift))}; +} + +inline float32x4_t load_as_float32x4(const BFloat16* ptr) { + int32x4_t shift = vdupq_n_s32(16); + uint32x4_t as_int = vmovl_u16(vld1_u16(reinterpret_cast(ptr))); + return vreinterpretq_f32_u32(vshlq_u32(as_int, shift)); +} + +inline float32x4_t load_as_float32x4(const float* ptr) { + return vld1q_f32(ptr); +} + +inline float32x4x2_t load_as_float32x4x2(const float* ptr) { + return {vld1q_f32(ptr), vld1q_f32(ptr + 4)}; +} + +template +inline void tinygemm_kernel_( + const T* RESTRICT A, + const int8_t* RESTRICT B, + const T* RESTRICT scales, + T* RESTRICT C, + int lda, + int ldb, + int 
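The bfloat16 loaders above rely on bfloat16 being the upper half of an IEEE float32: widening the 16-bit pattern and shifting it left by 16 (vshlq_u32 plus vreinterpretq_f32_u32) reproduces the float exactly. A scalar sketch of the same trick, not part of the patch; the truncating encoder exists only to build test inputs and is not taken from this code.

// bfloat16 bits -> float32 by shifting into the high half (illustrative only).
#include <cassert>
#include <cstdint>
#include <cstring>

float bf16_bits_to_float(uint16_t bits) {
  const uint32_t widened = static_cast<uint32_t>(bits) << 16;  // same idea as vshlq_u32(..., 16)
  float out;
  std::memcpy(&out, &widened, sizeof(out));  // reinterpret, like vreinterpretq_f32_u32
  return out;
}

uint16_t float_to_bf16_bits_truncate(float value) {
  uint32_t bits;
  std::memcpy(&bits, &value, sizeof(bits));
  return static_cast<uint16_t>(bits >> 16);  // drop the low 16 mantissa bits
}

int main() {
  // Values whose mantissa fits in bfloat16 round-trip exactly.
  for (float v : {1.0f, -2.5f, 0.15625f, 1024.0f}) {
    assert(bf16_bits_to_float(float_to_bf16_bits_truncate(v)) == v);
  }
  return 0;
}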
ldc, + int K) { + + for (const auto m : c10::irange(BLOCK_M)) { + float32x4_t c_val[BLOCK_N]; + c10::ForcedUnroll{}([&](auto i) { + c_val[i] = vdupq_n_f32(0.0); + }); + for (int k = 0; k < K; k += 8) { + auto a_val = load_as_float32x4x2(A + m * lda + k); + c10::ForcedUnroll{}([&](auto i) { + int16x8_t b_val = vmovl_s8(vld1_s8(B + i * ldb + k)); + auto b_val_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(b_val))); + auto b_val_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(b_val))); + c_val[i] = vfmaq_f32(c_val[i], a_val.val[1], b_val_high); + c_val[i] = vfmaq_f32(c_val[i], a_val.val[0], b_val_low); + }); + } + + float32x4_t scale_val = load_as_float32x4(scales); + c10::ForcedUnroll{}([&](auto i) { + C[m * ldc + i] = reduce(c_val[i]) * vgetq_lane_f32(scale_val, i); + }); + } +} + +template +inline void tinygemm_kernel( + const Half* RESTRICT A, + const int8_t* RESTRICT B, + const Half* RESTRICT scales, + Half* RESTRICT C, + int lda, + int ldb, + int ldc, + int K) { + tinygemm_kernel_(A, B, scales, C, lda, ldb, ldc, K); +} + +template +inline void tinygemm_kernel( + const BFloat16* RESTRICT A, + const int8_t* RESTRICT B, + const BFloat16* RESTRICT scales, + BFloat16* RESTRICT C, + int lda, + int ldb, + int ldc, + int K) { + tinygemm_kernel_(A, B, scales, C, lda, ldb, ldc, K); +} + +template +inline void tinygemm_kernel( + const float* RESTRICT A, + const int8_t* RESTRICT B, + const float* RESTRICT scales, + float* RESTRICT C, + int lda, + int ldb, + int ldc, + int K) { + tinygemm_kernel_(A, B, scales, C, lda, ldb, ldc, K); +} +#endif + +// non-vectorized version +template +inline void tinygemm_kernel( + const T* RESTRICT A, + const int8_t* RESTRICT B, + const T* RESTRICT scales, + T* RESTRICT C, + int lda, + int ldb, + int ldc, + int K) { + + for (const auto m : c10::irange(BLOCK_M)) { + for (const auto n : c10::irange(BLOCK_N)) { + float c_val = 0; + float scale_val = static_cast(scales[n]); + for (const auto k : c10::irange(K)) { + float a_val = static_cast(A[m * lda + k]); + float b_val = static_cast(B[n * ldb + k]); + c_val += a_val * (b_val * scale_val); + } + C[m * ldc + n] = c_val; + } + } +} + +#define LAUNCH_TINYGEMM_KERNEL(MB_SIZE, NB_SIZE) \ + tinygemm_kernel( \ + A_ptr, B_ptr, S_ptr, C_ptr, \ + K, K, N, K); + +#define LAUNCH_TINYGEMM_NB_SIZE(MB_SIZE) \ + switch (nb_size) { \ + case 1: \ + LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 1); \ + break; \ + case 2: \ + LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 2); \ + break; \ + case 3: \ + LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 3); \ + break; \ + case 4: \ + LAUNCH_TINYGEMM_KERNEL(MB_SIZE, 4); \ + break; \ + default: \ + TORCH_CHECK(false, "Unsupported n block size: ", nb_size); \ + break; \ + } + +template +void int8pack_mm_kernel_( + const Tensor& C, + const Tensor& A, + const Tensor& B, + const Tensor& scales) { + + const auto* A_data = A.const_data_ptr(); + const auto* B_data = B.const_data_ptr(); + auto* C_data = C.data_ptr(); + const auto* S_data = scales.const_data_ptr(); + + int M = A.size(0); + int N = B.size(0); + int K = A.size(1); + + constexpr int BLOCK_M = 4; + constexpr int BLOCK_N = 4; + + const int MB = (M + BLOCK_M - 1) / BLOCK_M; + const int NB = (N + BLOCK_N - 1) / BLOCK_N; + + at::parallel_for(0, MB * NB, 0, [&](int begin, int end) { + int mb{0}, nb{0}; + data_index_init(begin, mb, MB, nb, NB); + + for (const auto i : c10::irange(begin, end)) { + (void)i; + + int mb_start = mb * BLOCK_M; + int mb_size = std::min(BLOCK_M, M - mb_start); + int nb_start = nb * BLOCK_N; + int nb_size = std::min(BLOCK_N, N - nb_start); + + const auto* A_ptr = A_data + 
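All of the int8 tinygemm variants above compute the same reference result: activations A of shape {M, K} against a row-major int8 weight B of shape {N, K} (so effectively A * B^T), with one floating-point scale per output channel n. A plain C++ reference, not part of the patch, using float activations for brevity.

// Reference int8 matmul with per-output-channel scales (illustrative only).
#include <cstdint>
#include <cstdio>
#include <vector>

void int8_scaled_mm_ref(
    const std::vector<float>& A,       // {M, K}
    const std::vector<int8_t>& B,      // {N, K}
    const std::vector<float>& scales,  // {N}
    std::vector<float>& C,             // {M, N}
    int M, int N, int K) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = 0.f;
      for (int k = 0; k < K; ++k) {
        acc += A[m * K + k] * (static_cast<float>(B[n * K + k]) * scales[n]);
      }
      C[m * N + n] = acc;
    }
  }
}

int main() {
  const int M = 2, N = 3, K = 4;
  std::vector<float> A = {1, 2, 3, 4,  0.5f, -1, 2, 0};
  std::vector<int8_t> B = {1, 0, -1, 2,  3, 3, 3, 3,  -2, 1, 0, 4};
  std::vector<float> scales = {1.f, 0.5f, 0.25f};
  std::vector<float> C(M * N);
  int8_scaled_mm_ref(A, B, scales, C, M, N, K);
  for (float v : C) std::printf("%g ", v);
  std::printf("\n");
  return 0;
}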
mb_start * K; + const auto* B_ptr = B_data + nb_start * K; + const auto* S_ptr = S_data + nb_start; + auto* C_ptr = C_data + mb_start * N + nb_start; + + switch (mb_size) { + case 1: + LAUNCH_TINYGEMM_NB_SIZE(1); + break; + case 2: + LAUNCH_TINYGEMM_NB_SIZE(2); + break; + case 3: + LAUNCH_TINYGEMM_NB_SIZE(3); + break; + case 4: + LAUNCH_TINYGEMM_NB_SIZE(4); + break; + default: + TORCH_CHECK(false, "Unsupported m block size: ", mb_size); + } + + // move to the next index + data_index_step(mb, MB, nb, NB); + } + }); +} + +void int8pack_mm_kernel( + const Tensor& C, + const Tensor& A, + const Tensor& B, + const Tensor& scales) { + if (C.dtype() == kHalf) { + int8pack_mm_kernel_(C, A, B, scales); + } else if (C.dtype() == kBFloat16) { + int8pack_mm_kernel_(C, A, B, scales); + } else { + int8pack_mm_kernel_(C, A, B, scales); + } +} + +} // anonymous namespace + +ALSO_REGISTER_AVX512_DISPATCH(int8pack_mm_stub, &int8pack_mm_kernel); + +} // at::native diff --git a/aten/src/ATen/native/cpu/int_mm_kernel.h b/aten/src/ATen/native/cpu/int_mm_kernel.h new file mode 100644 index 0000000000000..f215078d61f91 --- /dev/null +++ b/aten/src/ATen/native/cpu/int_mm_kernel.h @@ -0,0 +1,16 @@ +#pragma once + +#include +#include + +namespace at::native { + +using weight_to_int4pack_fn = void(*)(const Tensor&, const Tensor&, int, int); +using int4pack_mm_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, int, const Tensor&, int, int); +using int8pack_mm_fn = void(*)(const Tensor&, const Tensor&, const Tensor&, const Tensor&); + +DECLARE_DISPATCH(weight_to_int4pack_fn, weight_to_int4pack_stub); +DECLARE_DISPATCH(int4pack_mm_fn, int4pack_mm_stub); +DECLARE_DISPATCH(int8pack_mm_fn, int8pack_mm_stub); + +} // namespace at::native diff --git a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp index a668305d462ab..c2dbd0d7c7858 100644 --- a/aten/src/ATen/native/cpu/layer_norm_kernel.cpp +++ b/aten/src/ATen/native/cpu/layer_norm_kernel.cpp @@ -36,9 +36,9 @@ void LayerNormKernelImplInternal( Tensor* mean, Tensor* rstd) { using Vec = vec::Vectorized; - const T* X_data = X.data_ptr(); - const T* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; - const T* beta_data = beta.defined() ? beta.data_ptr() : nullptr; + const T* X_data = X.const_data_ptr(); + const T* gamma_data = gamma.defined() ? gamma.const_data_ptr() : nullptr; + const T* beta_data = beta.defined() ? beta.const_data_ptr() : nullptr; T* Y_data = Y->data_ptr(); T* mean_data = mean ? mean->data_ptr() : nullptr; T* rstd_data = rstd ? rstd->data_ptr() : nullptr; @@ -51,9 +51,7 @@ void LayerNormKernelImplInternal( for (const auto i : c10::irange(start, end)) { const T* X_ptr = X_data + i * N; T* Y_ptr = Y_data + i * N; - T mean_val; - T rstd_val; - std::tie(mean_val, rstd_val) = RowwiseMoments(X_ptr, N); + auto [mean_val, rstd_val] = RowwiseMoments(X_ptr, N); rstd_val = T(1) / std::sqrt(rstd_val + eps); const T scale = rstd_val; const T bias = - mean_val; @@ -98,9 +96,9 @@ void layer_norm_kernel_mixed_type( Tensor* rstd) { using bVec = Vectorized; using fVec = Vectorized; - const T* X_data = X.data_ptr(); - const param_t* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; - const param_t* beta_data = beta.defined() ? beta.data_ptr() : nullptr; + const T* X_data = X.const_data_ptr(); + const param_t* gamma_data = gamma.defined() ? gamma.const_data_ptr() : nullptr; + const param_t* beta_data = beta.defined() ? beta.const_data_ptr() : nullptr; T* Y_data = Y->data_ptr(); param_t* mean_data = mean ? 
mean->data_ptr() : nullptr; param_t* rstd_data = rstd ? rstd->data_ptr() : nullptr; @@ -113,9 +111,7 @@ void layer_norm_kernel_mixed_type( for (const auto i : c10::irange(start, end)) { const T* X_ptr = X_data + i * N; T* Y_ptr = Y_data + i * N; - float mean_val; - float rstd_val; - std::tie(mean_val, rstd_val) = RowwiseMoments(X_ptr, N); + auto [mean_val, rstd_val] = RowwiseMoments(X_ptr, N); rstd_val = float(1) / std::sqrt(rstd_val + eps); const float scale = rstd_val; const float bias = -rstd_val * mean_val; @@ -373,10 +369,9 @@ void layer_norm_backward_frame( if (N < bVec::size()) { bVec x_bvec = bVec::loadu(X_ptr, N); bVec dy_bvec = bVec::loadu(dY_ptr, N); - fVec x_fvec0, x_fvec1, dy_fvec0, dy_fvec1, gamma_fvec0, gamma_fvec1; - std::tie(x_fvec0, x_fvec1) = convert_to_float(x_bvec); - std::tie(dy_fvec0, dy_fvec1) = convert_to_float(dy_bvec); - std::tie(gamma_fvec0, gamma_fvec1) = load2f(gamma_data, N); + auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec); + auto [dy_fvec0, dy_fvec1] = convert_to_float(dy_bvec); + auto [gamma_fvec0, gamma_fvec1] = load2f(gamma_data, N); if (N > fVec::size()) { fVec db_fvec0 = dy_fvec0 * gamma_fvec0; fVec db_fvec1 = dy_fvec1 * gamma_fvec1; @@ -396,11 +391,10 @@ void layer_norm_backward_frame( int64_t d = bVec::size(); bVec x_bvec = bVec::loadu(X_ptr); bVec dy_bvec = bVec::loadu(dY_ptr); - fVec x_fvec0, x_fvec1, dy_fvec0, dy_fvec1, gamma_fvec0, gamma_fvec1; fVec ds_fvec0, ds_fvec1, db_fvec0, db_fvec1, acc_ds_fvec0, acc_ds_fvec1, acc_db_fvec0, acc_db_fvec1; - std::tie(x_fvec0, x_fvec1) = convert_to_float(x_bvec); - std::tie(dy_fvec0, dy_fvec1) = convert_to_float(dy_bvec); - std::tie(gamma_fvec0, gamma_fvec1) = load2f(gamma_data); + auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec); + auto [dy_fvec0, dy_fvec1] = convert_to_float(dy_bvec); + auto [gamma_fvec0, gamma_fvec1] = load2f(gamma_data); acc_db_fvec0 = dy_fvec0 * gamma_fvec0; acc_db_fvec1 = dy_fvec1 * gamma_fvec1; acc_ds_fvec0 = x_fvec0 * acc_db_fvec0; @@ -470,10 +464,9 @@ void layer_norm_backward_frame( for (; d < N - (N % bVec::size()); d += bVec::size()) { bVec x_bvec = bVec::loadu(X_ptr + d); bVec dy_bvec = bVec::loadu(dY_ptr + d); - fVec x_fvec0, x_fvec1, dy_fvec0, dy_fvec1, gamma_fvec0, gamma_fvec1; - std::tie(x_fvec0, x_fvec1) = convert_to_float(x_bvec); - std::tie(dy_fvec0, dy_fvec1) = convert_to_float(dy_bvec); - std::tie(gamma_fvec0, gamma_fvec1) = load2f(gamma_data + d); + auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec); + auto [dy_fvec0, dy_fvec1] = convert_to_float(dy_bvec); + auto [gamma_fvec0, gamma_fvec1] = load2f(gamma_data + d); fVec r_fvec0 = fVec(a) * dy_fvec0 * gamma_fvec0 + fVec(b) * x_fvec0 + fVec(c); fVec r_fvec1 = fVec(a) * dy_fvec1 * gamma_fvec1 + fVec(b) * x_fvec1 + fVec(c); bVec r_bvec = convert_from_float(r_fvec0, r_fvec1); @@ -482,10 +475,9 @@ void layer_norm_backward_frame( if (N - d > 0) { bVec x_bvec = bVec::loadu(X_ptr + d, N - d); bVec dy_bvec = bVec::loadu(dY_ptr + d, N - d); - fVec x_fvec0, x_fvec1, dy_fvec0, dy_fvec1, gamma_fvec0, gamma_fvec1; - std::tie(x_fvec0, x_fvec1) = convert_to_float(x_bvec); - std::tie(dy_fvec0, dy_fvec1) = convert_to_float(dy_bvec); - std::tie(gamma_fvec0, gamma_fvec1) = load2f(gamma_data + d, N - d); + auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec); + auto [dy_fvec0, dy_fvec1] = convert_to_float(dy_bvec); + auto [gamma_fvec0, gamma_fvec1] = load2f(gamma_data + d, N - d); fVec r_fvec0 = fVec(a) * dy_fvec0 * gamma_fvec0 + fVec(b) * x_fvec0 + fVec(c); fVec r_fvec1 = fVec(a) * dy_fvec1 * gamma_fvec1 + fVec(b) * x_fvec1 + fVec(c); 
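The layer-norm changes above repeatedly swap a declare-then-std::tie pattern for a C++17 structured binding, which keeps the unpacked values in a single declaration. A small illustration of the before/after, not from the patch; RowwiseMomentsLike is a made-up stand-in for RowwiseMoments.

// std::tie vs. structured binding for a two-value return (illustrative only).
#include <cmath>
#include <cstdio>
#include <tuple>
#include <utility>

std::pair<float, float> RowwiseMomentsLike(const float* x, int n) {
  float mean = 0.f, var = 0.f;
  for (int i = 0; i < n; ++i) mean += x[i];
  mean /= n;
  for (int i = 0; i < n; ++i) var += (x[i] - mean) * (x[i] - mean);
  return {mean, var / n};
}

int main() {
  const float x[4] = {1.f, 2.f, 3.f, 4.f};

  // Before: declare first, then std::tie into the existing variables.
  float mean_val, rstd_val;
  std::tie(mean_val, rstd_val) = RowwiseMomentsLike(x, 4);

  // After: one declaration, names bound directly to the returned elements.
  auto [mean_val2, rstd_val2] = RowwiseMomentsLike(x, 4);
  rstd_val2 = 1.f / std::sqrt(rstd_val2 + 1e-5f);

  std::printf("%f %f %f %f\n", mean_val, rstd_val, mean_val2, rstd_val2);
  return 0;
}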
bVec r_bvec = convert_from_float(r_fvec0, r_fvec1); @@ -513,12 +505,12 @@ void LayerNormBackwardKernelImplInternal( TORCH_DCHECK_EQ(mean.numel(), M); TORCH_DCHECK_EQ(rstd.numel(), M); DCHECK(!gamma.defined() || gamma.numel() == N); - const T* dY_data = dY.template data_ptr(); - const T* X_data = X.template data_ptr(); - const T2* mean_data = mean.template data_ptr(); - const T2* rstd_data = rstd.template data_ptr(); + const T* dY_data = dY.template const_data_ptr(); + const T* X_data = X.template const_data_ptr(); + const T2* mean_data = mean.template const_data_ptr(); + const T2* rstd_data = rstd.template const_data_ptr(); const T2* gamma_data = - gamma.defined() ? gamma.template data_ptr() : nullptr; + gamma.defined() ? gamma.template const_data_ptr() : nullptr; T* dX_data = dX->defined() ? dX->template data_ptr() : nullptr; T2* dgamma_data = dgamma->defined() ? dgamma->template data_ptr() : nullptr; T2* dbeta_data = dbeta->defined() ? dbeta->template data_ptr() : nullptr; diff --git a/aten/src/ATen/native/cpu/moments_utils.h b/aten/src/ATen/native/cpu/moments_utils.h index c89aa6b3f602d..f5337f5ff4ebe 100644 --- a/aten/src/ATen/native/cpu/moments_utils.h +++ b/aten/src/ATen/native/cpu/moments_utils.h @@ -93,8 +93,7 @@ UpdateMomentsVec( fVec m2_fvec0(0), m2_fvec1(0); for (const auto j : c10::irange(m0)) { const Vec x_bvec = Vec::loadu(X_ptr + j * Vec::size()); - fVec x_fvec0, x_fvec1; - std::tie(x_fvec0, x_fvec1) = convert_to_float(x_bvec); + auto [x_fvec0, x_fvec1] = convert_to_float(x_bvec); const fVec delta_fvec0 = x_fvec0 - m1_fvec0; const fVec delta_fvec1 = x_fvec1 - m1_fvec1; m1_fvec0 += delta_fvec0 * c_vecs[j]; diff --git a/aten/src/ATen/native/cpu/utils.h b/aten/src/ATen/native/cpu/utils.h index 6607c287cf0e3..641ac0cd06125 100644 --- a/aten/src/ATen/native/cpu/utils.h +++ b/aten/src/ATen/native/cpu/utils.h @@ -21,6 +21,11 @@ inline void _store(at::BFloat16* dst, at::vec::Vectorized src) { res.store(dst, at::vec::Vectorized::size()); } +inline void _store(at::Half* dst, at::vec::Vectorized src) { + auto res = at::vec::convert_float_half(src, src); + res.store(dst, at::vec::Vectorized::size()); +} + inline namespace CPU_CAPABILITY { template @@ -56,8 +61,7 @@ struct Vec2 { Vec2(Vectorized v0, Vectorized v1) : val0(v0), val1(v1) {} Vec2(float v) : val0(v), val1(v) {} static Vec2 loadu(const BFloat16* ptr) { - Vectorized v0, v1; - std::tie(v0, v1) = convert_bfloat16_float(Vectorized::loadu(ptr)); + auto [v0, v1] = convert_bfloat16_float(Vectorized::loadu(ptr)); return {v0, v1}; } static Vec2 loadu(const float* ptr) { diff --git a/aten/src/ATen/native/cpu/zmath.h b/aten/src/ATen/native/cpu/zmath.h index 3f3971e6e76e2..9b52039e84f91 100644 --- a/aten/src/ATen/native/cpu/zmath.h +++ b/aten/src/ATen/native/cpu/zmath.h @@ -2,7 +2,6 @@ // Complex number math operations that act as no-ops for other dtypes. 
#include -#include #include #include diff --git a/aten/src/ATen/native/cuda/Activation.cpp b/aten/src/ATen/native/cuda/Activation.cpp index 633a5f386a87e..6bbfd985d3572 100644 --- a/aten/src/ATen/native/cuda/Activation.cpp +++ b/aten/src/ATen/native/cuda/Activation.cpp @@ -44,8 +44,8 @@ Tensor& glu_backward_cuda_out(const Tensor& grad_output, const Tensor& input, const auto iter = at::TensorIteratorConfig() .add_output(grad_input) - .add_input(input) - .add_input(grad_output) + .add_const_input(input) + .add_const_input(grad_output) .resize_outputs(false) .declare_static_shape(iter_shape) .build(); @@ -80,7 +80,7 @@ std::tuple log_sigmoid_forward_out_cuda(const Tensor& input, T // NOTE: buffer is only used by CPU dispatch, we just ignore it here auto iter = TensorIteratorConfig() .add_output(result) - .add_input(input) + .add_const_input(input) .build(); launch_log_sigmoid_forward_kernel(iter); return std::forward_as_tuple(result, buffer); diff --git a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu index 32cfb8fcf0339..9db469cd4f752 100644 --- a/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu +++ b/aten/src/ATen/native/cuda/AdaptiveAveragePooling.cu @@ -703,7 +703,7 @@ namespace { ); } while (!done && max_threads); if (!done) { - TORCH_INTERNAL_ASSERT(false, "Couldn't reduce launch bounds to accomodate shaedMemPerBlock limit"); + TORCH_INTERNAL_ASSERT(false, "Couldn't reduce launch bounds to accomodate sharedMemPerBlock limit"); } break; } diff --git a/aten/src/ATen/native/cuda/AveragePool3d.cu b/aten/src/ATen/native/cuda/AveragePool3d.cu index a722236ea57cc..f4b0ee00d9a9a 100644 --- a/aten/src/ATen/native/cuda/AveragePool3d.cu +++ b/aten/src/ATen/native/cuda/AveragePool3d.cu @@ -34,7 +34,7 @@ __device__ inline int max(int a, int b) { template __global__ void avg_pool3d_cuda_update_output( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, int kT, int kH, int kW, int dT, int dH, int dW, @@ -88,7 +88,7 @@ __global__ void avg_pool3d_cuda_update_output( { for (wi = wstart; wi < wend; ++wi) { - scalar_t val = input[slice][ti][hi][wi]; + const scalar_t val = input[slice][ti][hi][wi]; sum += val; } } @@ -103,7 +103,7 @@ __global__ void avg_pool3d_cuda_update_output( // template __global__ void avg_pool3d_cuda_update_output( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, int kT, int kH, int dT, int dH, int dW, @@ -157,7 +157,7 @@ __global__ void avg_pool3d_cuda_update_output( { for (wi = wstart; wi < wend; ++wi) { - scalar_t val = input[slice][ti][hi][wi]; + const scalar_t val = input[slice][ti][hi][wi]; sum += val; } } @@ -169,7 +169,7 @@ __global__ void avg_pool3d_cuda_update_output( template __global__ void avg_pool3d_single_backward_out_frame_stride1( - PackedTensorAccessor64 gradOutput, + PackedTensorAccessor64 gradOutput, PackedTensorAccessor64 gradInput, int kT, int kH, int kW, accscalar_t normFactor, @@ -184,7 +184,7 @@ __global__ void avg_pool3d_single_backward_out_frame_stride1( if (iRow < gradInput.size(2) && iCol < gradInput.size(3)) { accscalar_t sum = 0.0; - scalar_t *gOut = &gradOutput[slice][max(0, iFrame - kT + 1)] + const scalar_t *gOut = &gradOutput[slice][max(0, iFrame - kT + 1)] [max(0, iRow - kH + 1)][max(0, iCol - kW + 1)]; int frameOffset = 0; for (int oFrame = max(0, iFrame - kT + 1); @@ -214,7 +214,7 @@ __global__ void avg_pool3d_single_backward_out_frame_stride1( template __global__ void 
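The add_const_input and PackedTensorAccessor<const scalar_t, ...> changes in this area all serve the same purpose: once a read-only operand is expressed with a const element type, an accidental write through it becomes a compile error rather than a silent mutation. A minimal sketch of that effect, not from the patch; Accessor is an illustrative stand-in for the real accessor and iterator types.

// Const element type turns writes through a read-only view into compile errors.
#include <cstdio>

template <typename T>
struct Accessor {
  T* data;
  int size;
  T& operator[](int i) const { return data[i]; }
};

void scale_kernel(Accessor<const float> in, Accessor<float> out, float alpha) {
  for (int i = 0; i < in.size; ++i) {
    out[i] = alpha * in[i];
    // in[i] = 0.f;  // would not compile: assignment to a const element
  }
}

int main() {
  float src[3] = {1.f, 2.f, 3.f};
  float dst[3] = {};
  scale_kernel(Accessor<const float>{src, 3}, Accessor<float>{dst, 3}, 2.f);
  std::printf("%g %g %g\n", dst[0], dst[1], dst[2]);
  return 0;
}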
avg_pool3d_cuda_update_grad_input_atomic( - PackedTensorAccessor64 gradOutput, + PackedTensorAccessor64 gradOutput, PackedTensorAccessor64 gradInput, int kT, int kH, int kW, int dT, int dH, int dW, @@ -273,7 +273,7 @@ __global__ void avg_pool3d_cuda_update_grad_input_atomic( template __global__ void avg_pool3d_cuda_update_grad_input( - PackedTensorAccessor64 gradOutput, + PackedTensorAccessor64 gradOutput, PackedTensorAccessor64 gradInput, int kT, int kH, int kW, int dT, int dH, int dW, @@ -333,7 +333,7 @@ __global__ void avg_pool3d_cuda_update_grad_input( #define LAUNCH_UPDATE_OUTPUT_KERNEL_WIDTH(KW) case KW: \ avg_pool3d_cuda_update_output \ <<>>( \ - work_input.packed_accessor64(), \ + work_input.packed_accessor64(), \ work_output.packed_accessor64(), \ kT, kH, \ dT, dH, dW, \ @@ -422,7 +422,7 @@ TORCH_IMPL_FUNC(avg_pool3d_out_cuda) ( default: avg_pool3d_cuda_update_output <<>>( - work_input.packed_accessor64(), + work_input.packed_accessor64(), work_output.packed_accessor64(), kT, kH, kW, dT, dH, dW, @@ -543,7 +543,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cuda) ( avg_pool3d_single_backward_out_frame_stride1 <<>>( - work_grad_output.packed_accessor64(), + work_grad_output.packed_accessor64(), work_grad_input.packed_accessor64(), kT, kH, kW, 1.0f/divide_factor, @@ -573,7 +573,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cuda) ( if (kernelsOverlap) { avg_pool3d_cuda_update_grad_input_atomic <<>>( - work_grad_output.packed_accessor64(), + work_grad_output.packed_accessor64(), work_grad_input.packed_accessor64(), kT, kH, kW, dT, dH, dW, @@ -585,7 +585,7 @@ TORCH_IMPL_FUNC(avg_pool3d_backward_out_cuda) ( else { avg_pool3d_cuda_update_grad_input <<>>( - work_grad_output.packed_accessor64(), + work_grad_output.packed_accessor64(), work_grad_input.packed_accessor64(), kT, kH, kW, dT, dH, dW, diff --git a/aten/src/ATen/native/cuda/Blas.cpp b/aten/src/ATen/native/cuda/Blas.cpp index 35a247725a3ea..df6f470916428 100644 --- a/aten/src/ATen/native/cuda/Blas.cpp +++ b/aten/src/ATen/native/cuda/Blas.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include #include #include @@ -17,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -25,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -153,7 +157,7 @@ enum class Activation { GELU, }; -#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activation a) { switch (a) { case Activation::None: @@ -172,6 +176,12 @@ cuda::blas::GEMMAndBiasActivationEpilogue activation_to_gemm_and_blas_arg(Activa static bool getDisableAddmmCudaLt() { static const char* env_value = std::getenv("DISABLE_ADDMM_CUDA_LT"); #ifdef USE_ROCM + // if we enable tunable op, it'll take priority over just hipblaslt (heuristics) + // note the current tunable op is not the hipblaslt path (gemm_and_bias) + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { + return true; + } // allow both CUDA and HIP env var names for ROCm builds // also, current default for ROCm builds is disable by default if (env_value == nullptr) { @@ -226,7 +236,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma at::ScalarType scalar_type = self.scalar_type(); c10::MaybeOwned self_; if (&result != &self) { -#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11040 && !defined(_MSC_VER)) || 
defined(USE_ROCM) && ROCM_VERSION >= 50700 +#if (defined(CUDA_VERSION) && (CUDA_VERSION >= 11040)) || (defined(USE_ROCM) && (ROCM_VERSION >= 50700)) // Strangely, if mat2 has only 1 row or column, we get // CUBLAS_STATUS_INVALID_VALUE error from cublasLtMatmulAlgoGetHeuristic. // self.dim() == 1 && result.dim() == 2 && self.sizes()[0] == mat2_sizes[1] @@ -250,10 +260,13 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma scalar_type == at::ScalarType::Half || scalar_type == at::ScalarType::BFloat16) && #endif +#if (defined(CUDA_VERSION) && CUDA_VERSION >= 12010 && !defined(USE_ROCM)) + mat2_sizes[0] > 1 && mat2_sizes[1] > 1; +#else mat2_sizes[0] > 1 && mat2_sizes[1] > 1 && mat2_sizes[0] < 65535 * 32 && mat2_sizes[1] < 65535 * 32 && mat1_sizes[0] < 65535 * 32 && mat1_sizes[1] < 65535 * 32 && - // avoid leaing dim >> rows bugs + // avoid leading dim >> rows bugs ((mat1.strides()[0] == 1 && mat1.strides()[1] == mat1_sizes[0]) || (mat1.strides()[1] == 1 && mat1.strides()[0] == mat1_sizes[1]) || (scalar_type != at::ScalarType::Half && @@ -262,6 +275,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma (mat2.strides()[1] == 1 && mat2.strides()[0] == mat2_sizes[1]) || (scalar_type != at::ScalarType::Half && scalar_type != at::ScalarType::BFloat16)); +#endif } #endif if (!useLtInterface) { @@ -309,7 +323,7 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma // That requires some fixing some internal build dependencies though. return at::mul_out( result, - self, + self.expand(result.sizes()), at::native::scalar_tensor( beta, self.scalar_type(), @@ -320,8 +334,9 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma TORCH_INTERNAL_ASSERT_DEBUG_ONLY(!args.result->is_conj()); -#if (!defined(USE_ROCM) && !defined(_MSC_VER)) || (defined(USE_ROCM) && ROCM_VERSION >= 50700) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && (ROCM_VERSION >= 50700)) if (useLtInterface) { +#if defined(USE_ROCM) AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, @@ -335,26 +350,53 @@ Tensor& addmm_out_cuda_impl(Tensor& result, const Tensor& self, const Tensor& ma args.n, args.k, alpha.to>(), - args.mata->data_ptr(), + args.mata->const_data_ptr(), args.lda, - args.matb->data_ptr(), + args.matb->const_data_ptr(), args.ldb, - self.const_data_ptr(), + // This condition is needed for mm case on ROCm for hipblasLt path. + // Passing the bias ptr as null to avoid accuracy issues for mm case. + (&result != &self) ? self.const_data_ptr() : nullptr, args.result->data_ptr(), args.result_ld, -#if (defined(CUDA_VERSION) && CUDA_VERSION >= 11080) || defined(USE_ROCM) activation_to_gemm_and_blas_arg(activation) + ); + }); #else - // GELU is not supported (and does not compile!) prior - // to CUDA 11.4. Have observed accuracy issues with - // GELU epilogue in 11.4; disabling the GELU epilogue - // path for CUDA version < 11.8. - activation != Activation::GELU - ? activation_to_gemm_and_blas_arg(activation) - : cuda::blas::GEMMAndBiasActivationEpilogue::None + auto activation_epilogue = activation_to_gemm_and_blas_arg(activation); +#if (defined(CUDA_VERSION) && (CUDA_VERSION < 11080)) + // GELU is not supported (and does not compile!) prior + // to CUDA 11.4. Have observed accuracy issues with + // GELU epilogue in 11.4; disabling the GELU epilogue + // path for CUDA version < 11.8. 
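The gemm_and_bias call above asks cuBLASLt / hipBLASLt for a fused epilogue: roughly out[i][j] = act(alpha * sum_k mat1[i][k] * mat2[k][j] + bias[j]), with the length-n bias broadcast across rows and the activation chosen by activation_to_gemm_and_blas_arg (this sketch assumes beta == 1, the case the fused path targets). A scalar reference of that shape, not from the patch; the erf-based GELU below is only the textbook definition, not necessarily the epilogue's exact formula.

// Reference for a gemm + bias + activation epilogue (illustrative only).
#include <cmath>
#include <cstdio>
#include <vector>

enum class Epilogue { None, ReLU, GELU };

float apply_epilogue(float x, Epilogue e) {
  switch (e) {
    case Epilogue::ReLU: return x > 0.f ? x : 0.f;
    case Epilogue::GELU: return 0.5f * x * (1.f + std::erf(x / std::sqrt(2.f)));
    default: return x;
  }
}

void gemm_bias_act_ref(int m, int n, int k, float alpha,
                       const std::vector<float>& mat1,  // {m, k}
                       const std::vector<float>& mat2,  // {k, n}
                       const std::vector<float>& bias,  // {n}
                       std::vector<float>& out,         // {m, n}
                       Epilogue e) {
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) acc += mat1[i * k + p] * mat2[p * n + j];
      out[i * n + j] = apply_epilogue(alpha * acc + bias[j], e);
    }
  }
}

int main() {
  std::vector<float> mat1 = {1, -2, 3, 4};   // 2x2
  std::vector<float> mat2 = {1, 0, 0, 1};    // 2x2 identity
  std::vector<float> bias = {0.5f, -0.5f};
  std::vector<float> out(4);
  gemm_bias_act_ref(2, 2, 2, 1.f, mat1, mat2, bias, out, Epilogue::ReLU);
  for (float v : out) std::printf("%g ", v);  // 1.5 0 3.5 3.5
  std::printf("\n");
  return 0;
}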
+ if (activation == Activation::GELU) + activation_epilogue = cuda::blas::GEMMAndBiasActivationEpilogue::None; #endif + + AT_DISPATCH_FLOATING_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + scalar_type, + "addmm_cuda_lt", + [&] { + at::cuda::blas::gemm_and_bias( + args.transa == 't', + args.transb == 't', + args.m, + args.n, + args.k, + alpha.to>(), + args.mata->const_data_ptr(), + args.lda, + args.matb->const_data_ptr(), + args.ldb, + self.const_data_ptr(), + args.result->data_ptr(), + args.result_ld, + activation_epilogue ); }); +#endif } else #endif { @@ -728,7 +770,7 @@ Tensor& _int_mm_out_cuda(const Tensor& self, const Tensor& mat2, Tensor& result) TORCH_CHECK(result.is_contiguous(), "Expected result to be contiguous."); -#if !defined(USE_ROCM) && !defined(_MSC_VER) && defined(CUDA_VERSION) && CUDA_VERSION >= 11070 +#if (!defined(USE_ROCM) && defined(CUDA_VERSION) && (CUDA_VERSION >= 11070)) || (defined(USE_ROCM) && (ROCM_VERSION >= 60000)) cublasCommonArgs args(self, mat2, result); at::cuda::blas::int8_gemm( @@ -748,7 +790,7 @@ Tensor& _int_mm_out_cuda(const Tensor& self, const Tensor& mat2, Tensor& result) result.copy_(*args.result); } #else -#if !defined(USE_ROCM) && !defined(_MSC_VER) && defined(CUDA_VERSION) +#if !defined(USE_ROCM) && defined(CUDA_VERSION) TORCH_CHECK(false, "_int_mm_out_cuda not compiled for CUDA ", CUDA_VERSION); #else TORCH_CHECK(false, "_int_mm_out_cuda not compiled for this platform."); @@ -763,12 +805,42 @@ Tensor _int_mm_cuda(const Tensor& self, const Tensor& mat2) { return _int_mm_out_cuda(self, mat2, result); } +static bool _scaled_mm_allowed_device() { + auto dprops = at::cuda::getCurrentDeviceProperties(); +#ifdef USE_ROCM + std::string device_arch = dprops->gcnArchName; + static const std::vector archs = {"gfx940", "gfx941", "gfx942"}; + for (std::string arch : archs) { + size_t substring = device_arch.find(arch); + if (substring != std::string::npos) { + return true; + } + } + return false; +#else + return dprops->major >= 9 || (dprops->major == 8 && dprops->minor == 9); +#endif +} + // Computes matrix multiply + bias while applying scaling to input and output matrices and computes amax // Scales are only applicable when matrices are of Float8 type and assumbed to be equal to 1.0 by default. // If output matrix type is 16 or 32-bit type, neither scale_result is applied nor amax is computed. 
// Known limitations: // - Only works if mat1 is row-major and mat2 is column-major // - Only works if matrices sizes are divisible by 32 +// +// Arguments: +// - `mat1`: the first operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `mat2`: the second operand of the matrix multiply, can be type `torch.float8_e4m3fn` or `torch.float8_e5m2` +// - `bias`: the bias, can be type `torch.float16` or `torch.bfloat16` +// - `out_dtype`: the output dtype, can either be a float8 or a higher precision floating point type +// - `scale_a`: a scalar tensor with the inverse scale of `mat1`, only needed if `mat1` is a float8 type +// - `scale_b`: a scalar tensor with the inverse scale of `mat2`, only needed if `mat2` is a float8 type +// - `scale_result`: a scalar tensor with the scale of the output, only set if the output is a float8 type +// - `use_fast_accum`: if true, enables fast float8 accumulation +// - `out`: a reference to the output tensor +// - `amax`: a reference to the amax tensor of the output, only needed if the output is a float8 type and will be updated inplace + std::tuple _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, const c10::optional& bias, @@ -779,8 +851,8 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, bool use_fast_accum, Tensor& out, Tensor& amax) { // Check sizes - auto dprops = at::cuda::getCurrentDeviceProperties(); - TORCH_CHECK(dprops->major >= 9, "torch._scaled_mm is only supported on devices with compute capability >= 9.0)"); + bool allowed_device = _scaled_mm_allowed_device(); + TORCH_CHECK(allowed_device, "torch._scaled_mm is only supported on CUDA devices with compute capability >= 9.0 or 8.9, or ROCm MI300+"); TORCH_CHECK(mat1.dim() == 2, "mat1 must be a matrix"); TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); TORCH_CHECK( @@ -796,7 +868,7 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, " but got ", bias->numel()); TORCH_CHECK( mat1.sizes()[1] % 16 == 0, - "Expected trailing dimension of mat1 to be divisble by 16 ", + "Expected trailing dimension of mat1 to be divisible by 16 ", "but got mat1 shape: (", mat1.sizes()[0], "x", @@ -838,36 +910,121 @@ _scaled_mm_out_cuda(const Tensor& mat1, const Tensor& mat2, at::native::resize_output(out, {mat1_sizes[0], mat2_sizes[1]}); at::native::resize_output(amax, {}); -#if !defined(USE_ROCM) && !defined(_MSC_VER) +#if !defined(USE_ROCM) || (defined(USE_ROCM) && (ROCM_VERSION >= 60000)) cublasCommonArgs args(mat1, mat2, out); const auto out_dtype_ = args.result->scalar_type(); TORCH_CHECK(args.transa == 't' && args.transb == 'n', "Only multiplication of row-major and column-major matrices is supported by cuBLASLt"); - at::cuda::blas::scaled_gemm( - args.transa, - args.transb, - args.m, - args.n, - args.k, - args.mata->data_ptr(), - scale_a ? scale_a->data_ptr() : nullptr, - args.lda, - args.mata->scalar_type(), - args.matb->data_ptr(), - scale_b ? scale_b->data_ptr() : nullptr, - args.ldb, - args.matb->scalar_type(), - bias ? bias->data_ptr(): nullptr, - bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_, - args.result->data_ptr(), - scale_result ? 
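As a rough model of the contract documented above (not from the patch, and not an exact cuBLASLt specification): each float8 operand is brought back to real values with its per-tensor scale, the product is accumulated in float, amax records the largest magnitude of that accumulated result, and scale_result only matters when the output itself is a float8 type. That reading is also why the ROCm branch further down recomputes amax from the finished output. Float8 storage is faked with plain floats to keep the sketch short.

// Rough scalar model of a scaled float8 matmul with amax tracking (illustrative only).
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

void scaled_mm_ref(int m, int n, int k,
                   const std::vector<float>& a_fp8, float scale_a,  // {m, k} payloads
                   const std::vector<float>& b_fp8, float scale_b,  // {k, n} payloads
                   std::vector<float>& out, float* amax) {          // {m, n}
  *amax = 0.f;
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = 0.f;
      for (int p = 0; p < k; ++p) {
        acc += (a_fp8[i * k + p] * scale_a) * (b_fp8[p * n + j] * scale_b);
      }
      out[i * n + j] = acc;
      *amax = std::max(*amax, std::fabs(acc));  // largest |entry| before any output cast
    }
  }
}

int main() {
  std::vector<float> a = {1, 2, 3, 4};   // 2x2 stand-in "float8" payloads
  std::vector<float> b = {1, -1, 1, 1};  // 2x2
  std::vector<float> out(4);
  float amax = 0.f;
  scaled_mm_ref(2, 2, 2, a, 0.5f, b, 2.f, out, &amax);
  std::printf("amax = %g\n", amax);
  return 0;
}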
scale_result->data_ptr() : nullptr, - args.result_ld, - out_dtype_, - amax.data_ptr(), - use_fast_accum); +#ifdef USE_ROCM + auto tuning_ctx = at::cuda::tunable::getTuningContext(); + if (tuning_ctx->IsTunableOpEnabled()) { +#define TUNABLE_DISPATCH(BLASOP_A, BLASOP_B) \ + if (mat1.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fnuz, at::Float8_e4m3fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e4m3fnuz, at::Float8_e5m2fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } \ + else if (mat1.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ + if (mat2.scalar_type() == ScalarType::Float8_e4m3fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2fnuz, at::Float8_e4m3fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + else if (mat2.scalar_type() == ScalarType::Float8_e5m2fnuz) { \ + static at::cuda::tunable::ScaledGemmTunableOp< \ + at::Float8_e5m2fnuz, at::Float8_e5m2fnuz, scalar_t, \ + BLASOP_A, BLASOP_B> scaledgemm{}; \ + scaledgemm(¶ms); \ + } \ + } + AT_DISPATCH_V2(out_dtype_, "_tunable_scaled_gemm", AT_WRAP([&] { + bool transa_ = ((args.transa != 'n') && (args.transa != 'N')); + bool transb_ = ((args.transb != 'n') && (args.transb != 'N')); + at::cuda::tunable::ScaledGemmParams params; + params.transa = args.transa; + params.transb = args.transb; + params.m = args.m; + params.n = args.n; + params.k = args.k; + params.a = args.mata->data_ptr(); + params.a_scale_ptr = scale_a ? scale_a->data_ptr() : nullptr; + params.lda = args.lda; + params.a_dtype = args.mata->scalar_type(); + params.b = args.matb->data_ptr(); + params.b_scale_ptr = scale_b ? scale_b->data_ptr() : nullptr; + params.ldb = args.ldb; + params.b_dtype = args.matb->scalar_type(); + params.bias_ptr = bias ? bias->data_ptr(): nullptr; + params.bias_dtype = bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? at::ScalarType::Half : out_dtype_; + params.c = args.result->data_ptr(); + params.c_scale_ptr = scale_result ? scale_result->data_ptr() : nullptr; + params.ldc = args.result_ld; + params.c_dtype = out_dtype_; + params.amax_ptr = amax.data_ptr(); + params.use_fast_accum = use_fast_accum; + if (transa_ && transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::T) + } + else if (transa_ && !transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::T, at::cuda::tunable::BlasOp::N) + } + else if (!transa_ && transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::T) + } + else if (!transa_ && !transb_) { + TUNABLE_DISPATCH(at::cuda::tunable::BlasOp::N, at::cuda::tunable::BlasOp::N) + } + else { + TORCH_CHECK(false, "unreachable"); + } + }), + kHalf, kBFloat16, kFloat8_e4m3fnuz, kFloat8_e5m2fnuz, AT_EXPAND(AT_FLOATING_TYPES)); +#undef TUNABLE_DISPATCH + } + else +#endif + { + at::cuda::blas::scaled_gemm( + args.transa, + args.transb, + args.m, + args.n, + args.k, + args.mata->data_ptr(), + scale_a ? scale_a->data_ptr() : nullptr, + args.lda, + args.mata->scalar_type(), + args.matb->data_ptr(), + scale_b ? scale_b->data_ptr() : nullptr, + args.ldb, + args.matb->scalar_type(), + bias ? bias->data_ptr(): nullptr, + bias ? bias->scalar_type() : isFloat8Type(out_dtype_) ? 
at::ScalarType::Half : out_dtype_, + args.result->data_ptr(), + scale_result ? scale_result->data_ptr() : nullptr, + args.result_ld, + out_dtype_, + amax.data_ptr(), + use_fast_accum); + } #else TORCH_CHECK(false, "_scaled_mm_out_cuda is not compiled for this platform."); #endif +#if defined(USE_ROCM) && ROCM_VERSION >= 60000 + // rocm's hipblaslt does not yet support amax, so calculate separately + amax = at::max(at::abs(out.to(kFloat))); +#endif + return {out, amax}; } diff --git a/aten/src/ATen/native/cuda/Bucketization.cu b/aten/src/ATen/native/cuda/Bucketization.cu index c85c059f91c01..05d5421b046f8 100644 --- a/aten/src/ATen/native/cuda/Bucketization.cu +++ b/aten/src/ATen/native/cuda/Bucketization.cu @@ -18,7 +18,7 @@ namespace at::native { // Implement a numpy like searchsorted and a TF like bucketize function running on cuda -// See details in ATen/nativate/Bucketization.cpp +// See details in ATen/native/Bucketization.cpp namespace { @@ -149,7 +149,7 @@ Tensor& searchsorted_out_cuda( return result; } - // for non-contiguous result tensors, we write the output to a contiguous copy so we can later copy back, maintaing the original result tensor + // for non-contiguous result tensors, we write the output to a contiguous copy so we can later copy back, maintaining the original result tensor Tensor out = result; if (!result.is_contiguous()) { out = result.contiguous(); diff --git a/aten/src/ATen/native/cuda/CUDAJitLoops.cuh b/aten/src/ATen/native/cuda/CUDAJitLoops.cuh index 39b8a5bab4b7a..e764cc4ce8039 100644 --- a/aten/src/ATen/native/cuda/CUDAJitLoops.cuh +++ b/aten/src/ATen/native/cuda/CUDAJitLoops.cuh @@ -18,7 +18,6 @@ #include #include #include -#include #include #include diff --git a/aten/src/ATen/native/cuda/CUDALoops.cuh b/aten/src/ATen/native/cuda/CUDALoops.cuh index b7e4026eabb6b..b8eb85fd4eb2e 100644 --- a/aten/src/ATen/native/cuda/CUDALoops.cuh +++ b/aten/src/ATen/native/cuda/CUDALoops.cuh @@ -39,7 +39,6 @@ #include #include #include -#include #include #ifdef __NVCC__ @@ -303,6 +302,20 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { bool contiguous = iter.is_contiguous(); if (contiguous) { +#ifdef USE_ROCM + at::detail::Array dtypes; + auto inner_strides = iter.get_inner_strides(); + at::detail::Array strides; + for (int i = 0; i < ntensors; i++) { + dtypes[i] = iter.dtype(i); + strides[i] = inner_strides[i]; + } + launch_legacy_kernel<512, 1>(numel, [=]GPU_LAMBDA(int idx) { + void* out = data[0] + strides[0] * idx; + arg0_t result = invoke(f, &data.data[1], &strides.data[1], &dtypes.data[1], idx); + c10::cast_and_store(dtypes[0], out, result); + }); +#else auto loader = memory::LoadWithCast(iter); auto storer = memory::StoreWithCast<1>(iter); auto input_offset_calculator = TrivialOffsetCalculator(); @@ -315,6 +328,7 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { output_offset_calculator, loader, storer); +#endif } else { at::detail::Array dtypes; for (int i = 0; i < ntensors; i++) { @@ -324,8 +338,7 @@ void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { launch_legacy_kernel<128, 4>(numel, [=] GPU_LAMBDA(int idx) { auto offsets = offset_calc.get(idx); void* out = data[0] + offsets[0]; - arg0_t result = - invoke(f, &data.data[1], &offsets.data[1], &dtypes.data[1], 1); + arg0_t result = invoke(f, &data.data[1], &offsets.data[1], &dtypes.data[1], 1); c10::cast_and_store(dtypes[0], out, result); }); } diff --git a/aten/src/ATen/native/cuda/CUDAScalar.cu b/aten/src/ATen/native/cuda/CUDAScalar.cu index 
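The new ROCm contiguous path above addresses operands as byte pointers with per-element strides in bytes, so element idx lives at base + stride * idx and the element type is only resolved at runtime through c10::cast_and_store. A host-side sketch of that addressing pattern, not from the patch; cast_and_store_like is an illustrative stand-in.

// Byte-strided element addressing with a runtime dtype switch (illustrative only).
#include <cstdint>
#include <cstdio>
#include <cstring>

enum class Dtype { Float32, Float64, Int32 };

void cast_and_store_like(Dtype dt, void* dst, double value) {
  switch (dt) {
    case Dtype::Float32: { float v = static_cast<float>(value); std::memcpy(dst, &v, sizeof(v)); break; }
    case Dtype::Float64: { std::memcpy(dst, &value, sizeof(value)); break; }
    case Dtype::Int32:   { int32_t v = static_cast<int32_t>(value); std::memcpy(dst, &v, sizeof(v)); break; }
  }
}

int main() {
  float out[4] = {};
  char* data0 = reinterpret_cast<char*>(out);  // byte pointer, as in the kernel
  const int64_t stride0 = sizeof(float);       // contiguous: one element per step, in bytes
  for (int idx = 0; idx < 4; ++idx) {
    const double result = 1.5 * idx;           // stand-in for invoke(f, ...)
    cast_and_store_like(Dtype::Float32, data0 + stride0 * idx, result);
  }
  std::printf("%g %g %g %g\n", out[0], out[1], out[2], out[3]);
  return 0;
}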
8f5208ab59194..428c80a7e0e19 100644 --- a/aten/src/ATen/native/cuda/CUDAScalar.cu +++ b/aten/src/ATen/native/cuda/CUDAScalar.cu @@ -1,6 +1,7 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -16,10 +17,19 @@ Scalar _local_scalar_dense_cuda(const Tensor& self) { Scalar r; AT_DISPATCH_V2( self.scalar_type(), "_local_scalar_dense_cuda", AT_WRAP([&] { - scalar_t value; + // Create pinned memory for the scalar value to avoid implicit + // locking/sync in cuda library due to pageable memory + auto value = at::detail::empty_cpu( + {1}, /* size */ + c10::CppTypeToScalarType(), /* dtype */ + c10::nullopt, /* layout */ + c10::nullopt, /* device */ + true, /* pin_memory */ + c10::nullopt /* memory format */ + ); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); - at::cuda::memcpy_and_sync(&value, self.const_data_ptr(), sizeof(scalar_t), cudaMemcpyDeviceToHost, stream); - r = Scalar(value); + at::cuda::memcpy_and_sync((void *)value.const_data_ptr(), self.const_data_ptr(), sizeof(scalar_t), cudaMemcpyDeviceToHost, stream); + r = Scalar(*value.const_data_ptr()); }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kHalf, kBool, kBFloat16, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); return r; } diff --git a/aten/src/ATen/native/cuda/CompareEQKernel.cu b/aten/src/ATen/native/cuda/CompareEQKernel.cu index 9966c3b085050..9496ae95d13b2 100644 --- a/aten/src/ATen/native/cuda/CompareEQKernel.cu +++ b/aten/src/ATen/native/cuda/CompareEQKernel.cu @@ -33,7 +33,7 @@ C10_NOINLINE void compare_eq_ne_kernel(TensorIteratorBase &iter, EqOpType op) { AT_DISPATCH_V2(iter.common_dtype(), "compare_eq_ne_cuda", AT_WRAP([&]() { opmath_symmetric_gpu_kernel_with_scalars( iter, CompareEqFunctor(op)); - }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kHalf, kBFloat16, kBool, kFloat8_e4m3fn, kFloat8_e5m2, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); + }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kHalf, kBFloat16, kBool, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } void eq_kernel_cuda(TensorIteratorBase& iter) { diff --git a/aten/src/ATen/native/cuda/ConvolutionMM2d.cu b/aten/src/ATen/native/cuda/ConvolutionMM2d.cu index a405e93b1e034..9e45e2693cb0f 100644 --- a/aten/src/ATen/native/cuda/ConvolutionMM2d.cu +++ b/aten/src/ATen/native/cuda/ConvolutionMM2d.cu @@ -173,7 +173,7 @@ void slow_conv2d_forward( "slow_conv2d_cuda", [&] { // For each elt in batch, do: for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per output: + // Matrix multiply per output: auto input_n = input.select(0, elt); auto output_n = output.select(0, elt); @@ -255,7 +255,7 @@ void slow_conv2d_backward( "slow_conv2d_backward_cuda", [&] { // For each elt in batch, do: for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per sample: + // Matrix multiply per sample: auto grad_input_n = grad_input.select(0, elt); auto grad_output_n = grad_output.select(0, elt); @@ -327,10 +327,10 @@ void slow_conv2d_grad_weight( "slow_conv2d_grad_weight_cuda", [&] { // For each elt in batch, do: for (int elt = 0; elt < batchSize; elt ++) { - // Matrix mulitply per output: + // Matrix multiply per output: auto grad_output_n = grad_output.select(0, elt); - // Matrix mulitply per output: + // Matrix multiply per output: auto input_n = input.select(0, elt); if (requires_columns) { diff --git a/aten/src/ATen/native/cuda/Copy.cu b/aten/src/ATen/native/cuda/Copy.cu index 81149085354da..fad81d59d45c9 100644 --- a/aten/src/ATen/native/cuda/Copy.cu +++ 
b/aten/src/ATen/native/cuda/Copy.cu @@ -20,7 +20,7 @@ #include #include -// TODO(NS): Investigate why FP8 conversion intrisncs end up being slower +// TODO(NS): Investigate why FP8 conversion intrinsics end up being slower #ifdef AT_USE_NV_CVT_INTRINSICS #include #endif @@ -35,7 +35,6 @@ void float8_copy_kernel_cuda(TensorIteratorBase &iter) { ScalarType other_dtype = iter.dtype(1); if (dtype == kFloat8_e4m3fn) { switch (other_dtype) { -#if !defined(USE_ROCM) case kFloat: gpu_kernel_nocast(iter, [] GPU_LAMBDA(float value) { return Float8_e4m3fn(value); @@ -51,14 +50,12 @@ void float8_copy_kernel_cuda(TensorIteratorBase &iter) { return Float8_e4m3fn(value); }); break; -#endif /* !defined(USE_ROCM) */ default: gpu_kernel(iter, [] GPU_LAMBDA(Float8_e4m3fn x) { return x; }); break; } } else if (dtype == kFloat8_e5m2) { switch (other_dtype) { -#if !defined(USE_ROCM) case kFloat: gpu_kernel_nocast(iter, [] GPU_LAMBDA(float value) { #ifdef AT_USE_NV_CVT_INTRINSICS @@ -89,11 +86,52 @@ void float8_copy_kernel_cuda(TensorIteratorBase &iter) { #endif }); break; -#endif /* !defined(USE_ROCM) */ default: gpu_kernel(iter, [] GPU_LAMBDA(Float8_e5m2 x) { return x; }); break; } + } else if (dtype == kFloat8_e4m3fnuz) { + switch (other_dtype) { + case kFloat: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(float value) { + return Float8_e4m3fnuz(value); + }); + break; + case kHalf: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(Half value) { + return Float8_e4m3fnuz(value); + }); + break; + case kBFloat16: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(BFloat16 value) { + return Float8_e4m3fnuz(value); + }); + break; + default: + gpu_kernel(iter, [] GPU_LAMBDA(Float8_e4m3fnuz x) { return x; }); + break; + } + } else if (dtype == kFloat8_e5m2fnuz) { + switch (other_dtype) { + case kFloat: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(float value) { + return Float8_e5m2fnuz(value); + }); + break; + case kHalf: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(Half value) { + return Float8_e5m2fnuz(value); + }); + break; + case kBFloat16: + gpu_kernel_nocast(iter, [] GPU_LAMBDA(BFloat16 value) { + return Float8_e5m2fnuz(value); + }); + break; + default: + gpu_kernel(iter, [] GPU_LAMBDA(Float8_e5m2fnuz x) { return x; }); + break; + } } else { TORCH_CHECK(false, "This supposed ot be called only for Float8 types"); } @@ -107,16 +145,14 @@ void direct_copy_kernel_cuda(TensorIteratorBase &iter) { AT_DISPATCH_QINT_TYPES(dtype, "copy_", [&] { gpu_kernel(iter, [] GPU_LAMBDA(scalar_t x) { return x; }); }); - } else if (dtype == kFloat8_e5m2 || dtype == kFloat8_e4m3fn) { + } else if (dtype == kFloat8_e5m2 || dtype == kFloat8_e4m3fn || dtype == kFloat8_e5m2fnuz || dtype == kFloat8_e4m3fnuz) { float8_copy_kernel_cuda(iter); -#if !defined(USE_ROCM) } else if (isBitsType(dtype)) { TORCH_CHECK(dtype == iter.dtype(1), "copy_() does not support casting " "bits types to different bits types. Source dtype is ", iter.dtype(1), "target dtype is ", dtype); AT_DISPATCH_BIT_TYPES(dtype, "copy_", [&] { gpu_kernel_nocast(iter, [] GPU_LAMBDA(scalar_t x) { return x; }); }); -#endif /* !defined(USE_ROCM) */ } else { AT_DISPATCH_V2( dtype, "copy_", AT_WRAP([&] { @@ -266,9 +302,11 @@ static void copy_kernel_cuda(TensorIterator& iter, bool non_blocking) { Tensor src_contig; // If non_blocking is true - type conversions are performed on the GPU - // for CPU-GPU copies, otherwise type conversions are performed on the CPU. - // Type conversions are performed on the src device for GPU-GPU copies. 
- if (iter.device_type(0) == kCUDA || non_blocking) { + // For blocking transfers conversions are performed on CPU to avoid allocating + // extra GPU memory + // for GPU-GPU transfers conversions are performed on the source device + auto conversion_device = non_blocking ? kCUDA : kCPU; + if (iter.device_type(1) == conversion_device) { dst_contig = dst.is_contiguous() ? dst : at::empty_like(dst, LEGACY_CONTIGUOUS_MEMORY_FORMAT); src_contig = iter.tensor(1).to(iter.dtype(0)).expand_as(dst).contiguous(); } else { diff --git a/aten/src/ATen/native/cuda/CrossKernel.cu b/aten/src/ATen/native/cuda/CrossKernel.cu index 956ce2446dc18..560d419c982b5 100644 --- a/aten/src/ATen/native/cuda/CrossKernel.cu +++ b/aten/src/ATen/native/cuda/CrossKernel.cu @@ -68,8 +68,8 @@ void cross_impl(const Tensor& result, const Tensor& x1, const Tensor& x2, int64_ auto iter = TensorIteratorConfig() .add_output(result) - .add_input(x1) - .add_input(x2) + .add_const_input(x1) + .add_const_input(x2) .resize_outputs(false) .declare_static_shape(result.sizes(), /*squash_dims=*/dim) .build(); diff --git a/aten/src/ATen/native/cuda/CuFFTPlanCache.h b/aten/src/ATen/native/cuda/CuFFTPlanCache.h index edeb8e8c82f80..6bcd57027d517 100644 --- a/aten/src/ATen/native/cuda/CuFFTPlanCache.h +++ b/aten/src/ATen/native/cuda/CuFFTPlanCache.h @@ -123,11 +123,7 @@ static bool is_pow_of_two(int64_t x) { return (x & (x - 1)) == 0; } -#if defined(USE_ROCM) - using cufft_size_type = int; -#else - using cufft_size_type = long long int; -#endif +using cufft_size_type = long long int; using CuFFTDimVector = c10::SmallVector; @@ -299,25 +295,6 @@ class CuFFTConfig { // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu. const bool simple_layout = in_layout.simple && out_layout.simple; - -#if defined(USE_ROCM) - hipfftType exec_type = [&]{ - if (dtype == kFloat) { - switch (fft_type) { - case CuFFTTransformType::C2C: return HIPFFT_C2C; - case CuFFTTransformType::R2C: return HIPFFT_R2C; - case CuFFTTransformType::C2R: return HIPFFT_C2R; - } - } else if (dtype == kDouble) { - switch (fft_type) { - case CuFFTTransformType::C2C: return HIPFFT_Z2Z; - case CuFFTTransformType::R2C: return HIPFFT_D2Z; - case CuFFTTransformType::C2R: return HIPFFT_Z2D; - } - } - TORCH_CHECK(false, "hipFFT doesn't support transforms of type: ", dtype); - }(); -#else cudaDataType itype, otype, exec_type; const auto complex_input = cufft_complex_input(fft_type); const auto complex_output = cufft_complex_output(fft_type); @@ -336,7 +313,6 @@ class CuFFTConfig { } else { TORCH_CHECK(false, "cuFFT doesn't support tensor of type: ", dtype); } -#endif // disable auto allocation of workspace to use THC allocator CUFFT_CHECK(cufftSetAutoAllocation(plan(), /* autoAllocate */ 0)); @@ -350,29 +326,15 @@ class CuFFTConfig { // by assuming istride = ostride = 1. // // See NOTE [ cuFFT Embedded Strides ] in native/cuda/SpectralOps.cu. 
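The rewritten predicate in copy_kernel_cuda above takes the source-side conversion branch exactly when the source device matches (non_blocking ? CUDA : CPU), which is what the new comment spells out. A tiny sketch of just that decision, not from the patch; Device stands in for c10::DeviceType.

// Which side performs the dtype conversion in copy_kernel_cuda (illustrative only).
#include <cassert>

enum class Device { CPU, CUDA };

bool convert_on_source(Device src, bool non_blocking) {
  const Device conversion_device = non_blocking ? Device::CUDA : Device::CPU;
  return src == conversion_device;
}

int main() {
  assert(convert_on_source(Device::CPU,  /*non_blocking=*/false));  // blocking host source: convert on CPU
  assert(convert_on_source(Device::CUDA, /*non_blocking=*/true));   // non-blocking device source: convert on GPU
  assert(!convert_on_source(Device::CPU,  true));
  assert(!convert_on_source(Device::CUDA, false));
  return 0;
}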
-#if defined(USE_ROCM) - CUFFT_CHECK(hipfftMakePlanMany(plan(), signal_ndim, signal_sizes.data(), - /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, - /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, - exec_type, batch, &ws_size_t)); -#else CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(), /* inembed */ nullptr, /* base_istride */ 1, /* idist */ 1, itype, /* onembed */ nullptr, /* base_ostride */ 1, /* odist */ 1, otype, batch, &ws_size_t, exec_type)); -#endif } else { -#if defined(USE_ROCM) - CUFFT_CHECK(hipfftMakePlanMany(plan(), signal_ndim, signal_sizes.data(), - in_layout.embed.data(), in_layout.stride, in_layout.dist, - out_layout.embed.data(), out_layout.stride, out_layout.dist, - exec_type, batch, &ws_size_t)); -#else CUFFT_CHECK(cufftXtMakePlanMany(plan(), signal_ndim, signal_sizes.data(), in_layout.embed.data(), in_layout.stride, in_layout.dist, itype, out_layout.embed.data(), out_layout.stride, out_layout.dist, otype, batch, &ws_size_t, exec_type)); -#endif } ws_size = static_cast(ws_size_t); } diff --git a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu index cfa0e8a029ed1..69757df220886 100644 --- a/aten/src/ATen/native/cuda/DepthwiseConv2d.cu +++ b/aten/src/ATen/native/cuda/DepthwiseConv2d.cu @@ -32,10 +32,10 @@ PackedTensorAccessor32 dummy_packed_accessor32() { template __global__ void conv_depthwise2d_forward_kernel( - const PackedTensorAccessor32 input, + const PackedTensorAccessor32 input, PackedTensorAccessor32 output, - const PackedTensorAccessor32 weight, - const PackedTensorAccessor32 bias, + const PackedTensorAccessor32 weight, + const PackedTensorAccessor32 bias, bool biasEnabled, index_t totalElements, const int outputChannels, @@ -103,9 +103,9 @@ __global__ void conv_depthwise2d_forward_kernel( template __global__ void conv_depthwise2d_backward_kernel( - const PackedTensorAccessor32 grad_output, + const PackedTensorAccessor32 grad_output, PackedTensorAccessor32 grad_input, - const PackedTensorAccessor32 weight, + const PackedTensorAccessor32 weight, index_t totalElements, const int inputChannels, const int depthwiseMultiplier, @@ -174,8 +174,8 @@ __global__ void conv_depthwise2d_backward_kernel( template __global__ void conv_depthwise2d_grad_weight_kernel( - const PackedTensorAccessor32 grad_output, - const PackedTensorAccessor32 input, + const PackedTensorAccessor32 grad_output, + const PackedTensorAccessor32 input, PackedTensorAccessor32 grad_weight, const int batchSize, const int inputChannels, @@ -309,12 +309,12 @@ void conv_depthwise2d_forward_out( // Create PackedTensorAccessor // Kernel currently relies upon all the Tensors to be contiguous, but we made // them contiguous above - const auto input_a = input.packed_accessor32(); - const auto weight_a = weight.packed_accessor32(); + const auto input_a = input.packed_accessor32(); + const auto weight_a = weight.packed_accessor32(); const auto output_a = output.packed_accessor32(); const auto bias_a = has_bias ? 
- bias.packed_accessor32() : - dummy_packed_accessor32(); + bias.packed_accessor32() : + dummy_packed_accessor32(); if (kW == 3 && kH == 3) { conv_depthwise2d_forward_kernel<3> <<>>( input_a, output_a, weight_a, bias_a, has_bias, n, outputChannels, depthwiseMultiplier, @@ -387,9 +387,9 @@ void conv_depthwise2d_backward_out( const auto stream = c10::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, grad_output.scalar_type(), "conv_depthwise2d_backward_cuda", [&] { - auto grad_output_a = grad_output.packed_accessor32(); + auto grad_output_a = grad_output.packed_accessor32(); auto grad_input_a = grad_input.packed_accessor32(); - auto weight_a = weight.packed_accessor32(); + auto weight_a = weight.packed_accessor32(); if (kW == 3 && kH == 3) { if (dW == 1 && dH == 1){ @@ -501,8 +501,8 @@ void conv_depthwise2d_grad_weight_out( AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, grad_output.scalar_type(), "conv_depthwise2d_grad_weight_cuda", [&] { - const auto grad_output_a = grad_output.packed_accessor32(); - const auto input_a = input.packed_accessor32(); + const auto grad_output_a = grad_output.packed_accessor32(); + const auto input_a = input.packed_accessor32(); const auto grad_weight_a = grad_weight.packed_accessor32(); using acc_t = at::acc_type; int warp_size = at::cuda::warp_size(); diff --git a/aten/src/ATen/native/cuda/DepthwiseConv3d.cu b/aten/src/ATen/native/cuda/DepthwiseConv3d.cu index 631c2677900cd..991471a6ef82f 100644 --- a/aten/src/ATen/native/cuda/DepthwiseConv3d.cu +++ b/aten/src/ATen/native/cuda/DepthwiseConv3d.cu @@ -26,9 +26,9 @@ template __global__ void conv_depthwise3d_cuda_kernel( - const PackedTensorAccessor32 input, + const PackedTensorAccessor32 input, PackedTensorAccessor32 output, - const PackedTensorAccessor32 kernel, + const PackedTensorAccessor32 kernel, const scalar_t* bias, int strideT, int strideH, int strideW, int paddingT, int paddingH, int paddingW, @@ -99,9 +99,9 @@ template __global__ void conv_depthwise3d_cuda_backward_input_kernel( - const PackedTensorAccessor32 grad_output, + const PackedTensorAccessor32 grad_output, PackedTensorAccessor32 grad_input, - const PackedTensorAccessor32 kernel, + const PackedTensorAccessor32 kernel, int strideT_, int strideH_, int strideW_, int paddingT, int paddingH, int paddingW, int dilationT_, int dilationH_, int dilationW_) { @@ -180,8 +180,8 @@ template __global__ void conv_depthwise3d_cuda_backward_weight_kernel( - const PackedTensorAccessor32 grad_output, - const PackedTensorAccessor32 input, + const PackedTensorAccessor32 grad_output, + const PackedTensorAccessor32 input, PackedTensorAccessor32 grad_kernel, int strideT, int strideH_, int strideW_, int paddingT, int paddingH, int paddingW, @@ -361,9 +361,9 @@ void conv_depthwise_shape_check( conv_depthwise3d_cuda_kernel \ \ <<>>( \ - input_.packed_accessor32(), \ + input_.packed_accessor32(), \ output_.packed_accessor32(), \ - weight_.packed_accessor32(), \ + weight_.packed_accessor32(), \ bias_ptr, \ stride[0], stride[1], stride[2], \ padding[0], padding[1], padding[2], \ @@ -377,9 +377,9 @@ void conv_depthwise_shape_check( conv_depthwise3d_cuda_kernel \ \ <<>>( \ - input_.packed_accessor32(), \ + input_.packed_accessor32(), \ output_.packed_accessor32(), \ - weight_.packed_accessor32(), \ + weight_.packed_accessor32(), \ bias_ptr, \ stride[0], stride[1], stride[2], \ padding[0], padding[1], padding[2], \ @@ -470,9 +470,9 @@ Tensor conv_depthwise3d_cuda( conv_depthwise3d_cuda_backward_input_kernel \ \ <<>>( \ - 
grad_output_.packed_accessor32(), \ + grad_output_.packed_accessor32(), \ grad_input_.packed_accessor32(), \ - weight_.packed_accessor32(), \ + weight_.packed_accessor32(), \ stride[0], stride[1], stride[2], \ padding[0], padding[1], padding[2], \ dilation[0], dilation[1], dilation[2]); \ @@ -485,9 +485,9 @@ Tensor conv_depthwise3d_cuda( conv_depthwise3d_cuda_backward_input_kernel \ \ <<>>( \ - grad_output_.packed_accessor32(), \ + grad_output_.packed_accessor32(), \ grad_input_.packed_accessor32(), \ - weight_.packed_accessor32(), \ + weight_.packed_accessor32(), \ stride[0], stride[1], stride[2], \ padding[0], padding[1], padding[2], \ dilation[0], dilation[1], dilation[2]); \ @@ -500,8 +500,8 @@ Tensor conv_depthwise3d_cuda( conv_depthwise3d_cuda_backward_weight_kernel \ \ <<>>( \ - grad_output_.packed_accessor32(), \ - input_.packed_accessor32(), \ + grad_output_.packed_accessor32(), \ + input_.packed_accessor32(), \ grad_weight.packed_accessor32(), \ stride[0], stride[1], stride[2], \ padding[0], padding[1], padding[2], \ @@ -515,8 +515,8 @@ Tensor conv_depthwise3d_cuda( conv_depthwise3d_cuda_backward_weight_kernel \ \ <<>>( \ - grad_output_.packed_accessor32(), \ - input_.packed_accessor32(), \ + grad_output_.packed_accessor32(), \ + input_.packed_accessor32(), \ grad_weight.packed_accessor32(), \ stride[0], stride[1], stride[2], \ padding[0], padding[1], padding[2], \ diff --git a/aten/src/ATen/native/cuda/DistributionTemplates.h b/aten/src/ATen/native/cuda/DistributionTemplates.h index 04a278d83f763..8ac91f3114511 100644 --- a/aten/src/ATen/native/cuda/DistributionTemplates.h +++ b/aten/src/ATen/native/cuda/DistributionTemplates.h @@ -618,7 +618,7 @@ void bernoulli_tensor_cuda_kernel( }; // The template argument `4` below indicates that we want to operate on four // element at each time. See NOTE [ CUDA_tensor_applyN helpers ] for details. - at::cuda::CUDA_tensor_apply2(ret, p, functor); } diff --git a/aten/src/ATen/native/cuda/Dropout.cu b/aten/src/ATen/native/cuda/Dropout.cu index 67ea3e4f832b3..a749872ba38f3 100644 --- a/aten/src/ATen/native/cuda/Dropout.cu +++ b/aten/src/ATen/native/cuda/Dropout.cu @@ -45,7 +45,7 @@ template < C10_LAUNCH_BOUNDS_2(256, 4) #endif __global__ void -fused_dropout_kernel_vec(at::cuda::detail::TensorInfo a, +fused_dropout_kernel_vec(at::cuda::detail::TensorInfo a, at::cuda::detail::TensorInfo b, at::cuda::detail::TensorInfo c, IndexType totalElements, accscalar_t p, @@ -103,7 +103,7 @@ fused_dropout_kernel_vec(at::cuda::detail::TensorInfo a, // and replace IndexToOffset call with linearIndex to allow vectorization of NHWC (or other) // ordering. 
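Illustration only: the "single vectorized load" performed just below reinterprets VEC contiguous, suitably aligned scalars as one wider type so the hardware issues a single load instead of VEC scalar loads. A minimal CUDA sketch of the idea; FloatVec4 is a made-up stand-in for the aligned vector type the real kernel uses:

#include <cstdint>

// Requires base + linear_index to be 16-byte aligned and linear_index % 4 == 0.
struct alignas(4 * sizeof(float)) FloatVec4 {
  float val[4];
};

__device__ void load_vec4(const float* base, int64_t linear_index, FloatVec4* out) {
  // One 16-byte load in place of four 4-byte loads.
  *out = *reinterpret_cast<const FloatVec4*>(&base[linear_index]);
}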
// Single vectorized load - *value = *reinterpret_cast(&a.data[linearIndex]); + *value = *reinterpret_cast(&a.data[linearIndex]); scalar_t r[VEC]; mask_t mask[VEC]; @@ -133,7 +133,7 @@ template < C10_LAUNCH_BOUNDS_2(256, 4) #endif __global__ void -fused_dropout_kernel(cuda::detail::TensorInfo a, +fused_dropout_kernel(cuda::detail::TensorInfo a, cuda::detail::TensorInfo b, cuda::detail::TensorInfo c, IndexType totalElements, accscalar_t p, @@ -164,7 +164,7 @@ fused_dropout_kernel(cuda::detail::TensorInfo a, if (li < totalElements) { // Convert `linearIndex` into an offset of `a` const IndexType aOffset = - cuda::detail::IndexToOffset::get(li, a); + cuda::detail::IndexToOffset::get(li, a); src[ii] = a.data[aOffset]; } } @@ -187,8 +187,8 @@ void masked_scale_kernel(at::Tensor& ret, const at::Tensor& src, const at::Tenso auto iter = at::TensorIteratorConfig() .check_all_same_dtype(false) .add_output(ret) - .add_input(src) - .add_input(mask) + .add_const_input(src) + .add_const_input(mask) .build(); at::native::gpu_kernel( @@ -205,7 +205,7 @@ int get_vector_size(at::Tensor self, at::Tensor ret, at::Tensor mask) { if (!self.is_non_overlapping_and_dense() || !ret.is_non_overlapping_and_dense() || !mask.is_non_overlapping_and_dense()) { vec_size = 1; } else { - vec_size = memory::can_vectorize_up_to((char*)self.data_ptr()); + vec_size = memory::can_vectorize_up_to((const char*)self.const_data_ptr()); } // check that we'd have no remainders - prefer a smaller vector size with no remainders over a larger vector and remainder. @@ -236,7 +236,7 @@ inline void launcher( using accscalar_t = acc_type; accscalar_t pa = (accscalar_t)(p); auto self_info = - cuda::detail::getTensorInfo(self); + cuda::detail::getTensorInfo(self); auto ret_info = cuda::detail::getTensorInfo(ret); auto mask_info = diff --git a/aten/src/ATen/native/cuda/Embedding.cu b/aten/src/ATen/native/cuda/Embedding.cu index 92eb4bbbb4929..b8fb51304e4b0 100644 --- a/aten/src/ATen/native/cuda/Embedding.cu +++ b/aten/src/ATen/native/cuda/Embedding.cu @@ -150,7 +150,7 @@ __global__ void embedding_backward_kernel( // 5 // 8 - // Number of values proceessed by each thread (grain size) + // Number of values processed by each thread (grain size) const int SZ = 4; if (idx < numel diff --git a/aten/src/ATen/native/cuda/EmbeddingBag.cu b/aten/src/ATen/native/cuda/EmbeddingBag.cu index 52bb16b13c5bb..64852ae79b1f9 100644 --- a/aten/src/ATen/native/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/cuda/EmbeddingBag.cu @@ -538,7 +538,7 @@ Tensor _embedding_bag_per_sample_weights_backward_cuda( auto output = at::empty({num_samples}, grad.options()); - // Early return when there is no samples in the batch. This saves unnecesary kernel + // Early return when there is no samples in the batch. 
This saves unnecessary kernel // launch, but also prevents cudaGetLastError() to complain about invalid launch args if (num_samples == 0) { return output; diff --git a/aten/src/ATen/native/cuda/FillKernel.cu b/aten/src/ATen/native/cuda/FillKernel.cu index e7e1237a6f412..dc2ecf2db35b6 100644 --- a/aten/src/ATen/native/cuda/FillKernel.cu +++ b/aten/src/ATen/native/cuda/FillKernel.cu @@ -22,7 +22,7 @@ struct FillFunctor { void fill_kernel_cuda(TensorIterator& iter, const Scalar& value) { AT_DISPATCH_V2(iter.dtype(), "fill_cuda", AT_WRAP([&]() { gpu_kernel(iter, FillFunctor(value.to())); - }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kBool, kHalf, kBFloat16, kFloat8_e4m3fn, kFloat8_e5m2, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); + }), AT_EXPAND(AT_ALL_TYPES_AND_COMPLEX), kComplexHalf, kBool, kHalf, kBFloat16, AT_EXPAND(AT_FLOAT8_TYPES), AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } REGISTER_DISPATCH(fill_stub, &fill_kernel_cuda); diff --git a/aten/src/ATen/native/cuda/ForeachReduceOp.cu b/aten/src/ATen/native/cuda/ForeachReduceOp.cu index d8af951afa701..eed96563efcdc 100644 --- a/aten/src/ATen/native/cuda/ForeachReduceOp.cu +++ b/aten/src/ATen/native/cuda/ForeachReduceOp.cu @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -20,16 +21,33 @@ namespace at::native { +// _foreach_norm supports only L1, L2, and inf norm +enum class NormType { L1, L2, LInf }; + +// NOTE: This is a simple variant of TensorListMetadata in MultiTensorApply.cuh +// as we only need to track addresses for the lpnorm_cleanup function below. +// Why is this struct necessary? For the same reason the TensorListMetadata +// struct is necessary--which is to ferry static metadata to the CUDA kernel +// while complying with the 4kb size constraint. Since we only need to track +// addresses, we introduce this struct to be able to fit more Tensor pointers at +// a time, currently 400 empirically, compared to the much smaller values in +// depth_to_max_tensors. This way, we can launch fewer kernels for better +// performance. +// +// IF YOU USE THIS STRUCT, PLEASE ADD A ONE-OFF TEST IN test_foreach.py AS THIS +// IS CURRENTLY ONLY TESTED FOR _foreach_norm. +const size_t MAX_TENSORS_PER_KERNEL = 400; +struct TensorListAddresses { + const void* addresses[MAX_TENSORS_PER_KERNEL]; +}; + template < typename T, - int NormType, + NormType norm_type, int depth = 1, int r_args_depth = 1, int res_arg_index = 0> struct LpNormFunctor { - static_assert( - NormType == 1 || NormType == 2, - "foreach_norm supports only L1 and L2 norm"); using opmath_t = typename at::opmath_type; __device__ __forceinline__ void operator()( int chunk_size, @@ -47,7 +65,7 @@ struct LpNormFunctor { __shared__ opmath_t s_vals[512]; opmath_t vals[kILP]; T r_x[kILP]; - for (int i = 0; i < kILP; i++) { + for (int64_t i = 0; i < kILP; i++) { vals[i] = opmath_t(0); r_x[i] = T(0); } @@ -61,7 +79,11 @@ struct LpNormFunctor { #pragma unroll for (int ii = 0; ii < kILP; ii++) { opmath_t next = static_cast(r_x[ii]); - vals[ii] += NormType == 1 ? ::abs(next) : next * next; + if constexpr (norm_type == NormType::LInf) { + vals[ii] = max_propagate_nan(vals[ii], ::abs(next)); + } else { + vals[ii] += norm_type == NormType::L1 ? ::abs(next) : next * next; + } } } } else { @@ -72,7 +94,11 @@ struct LpNormFunctor { int i = i_start + threadIdx.x + ii * blockDim.x; if (i < n && i < chunk_size) { opmath_t next = static_cast(x[i]); - vals[ii] += NormType == 1 ? 
::abs(next) : next * next; + if constexpr (norm_type == NormType::LInf) { + vals[ii] = max_propagate_nan(vals[ii], ::abs(next)); + } else { + vals[ii] += norm_type == NormType::L1 ? ::abs(next) : next * next; + } } } } @@ -80,34 +106,52 @@ struct LpNormFunctor { auto val = opmath_t(0); for (int i = 0; i < kILP; i++) { - val += vals[i]; + if constexpr (norm_type == NormType::LInf) { + val = max_propagate_nan(val, vals[i]); + } else { + val += vals[i]; + } } - auto final = at::native::cuda_utils::BlockReduceSum(val, s_vals); + auto final_val = norm_type == NormType::L1 || norm_type == NormType::L2 + ? at::native::cuda_utils::BlockReduceSum(val, s_vals) + : at::native::cuda_utils::BlockReduceMax(val, s_vals); if (threadIdx.x == 0) { output_per_tensor [(tl.start_tensor_this_launch + tensor_loc) * max_chunks_per_tensor + - chunk_idx] = final; + chunk_idx] = final_val; } } }; -template > +template < + typename T, + NormType norm_type, + typename opmath_t = at::opmath_type> __global__ void lpnorm_cleanup( const opmath_t* output_per_tensor, - T* ret_per_tensor, + TensorListAddresses addr_struct, int max_chunks_per_tensor) { __shared__ opmath_t vals[512]; const opmath_t* output_this_tensor = output_per_tensor + blockIdx.x * max_chunks_per_tensor; opmath_t val = 0; - for (int i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x) { - val += output_this_tensor[i]; + for (size_t i = threadIdx.x; i < max_chunks_per_tensor; i += blockDim.x) { + if constexpr (norm_type == NormType::LInf) { + val = max_propagate_nan(val, output_this_tensor[i]); + } else { + val += output_this_tensor[i]; + } } - opmath_t final = at::native::cuda_utils::BlockReduceSum(val, vals); + opmath_t final_val = norm_type == NormType::L1 || norm_type == NormType::L2 + ? at::native::cuda_utils::BlockReduceSum(val, vals) + : at::native::cuda_utils::BlockReduceMax(val, vals); if (threadIdx.x == 0) { - ret_per_tensor[blockIdx.x] = NormType == 1 ? final : ::sqrt(final); + *(T*)addr_struct.addresses[blockIdx.x] = + norm_type == NormType::L1 || norm_type == NormType::LInf + ? 
final_val + : ::sqrt(final_val); } } @@ -135,14 +179,15 @@ std::vector foreach_tensor_norm_cuda( at::isComplexType(scalar_type); }); if (!can_use_fast_route(tensors) || has_int_or_complex || - !(p == static_cast(1) || p == static_cast(2))) { + !(p == static_cast(1) || p == static_cast(2) || + p == std::numeric_limits::infinity())) { return foreach_tensor_norm_slow(tensors, ord); } - const int ntensors = tensors.size(); + const size_t ntensors = tensors.size(); int max_chunks_per_tensor = -1; - for (int t = 0; t < ntensors; t++) { + for (const auto t : c10::irange(ntensors)) { int max_chunks_this_tensor = (tensors[t].numel() + kChunkSize - 1) / kChunkSize; if (max_chunks_this_tensor > max_chunks_per_tensor) { @@ -151,9 +196,14 @@ std::vector foreach_tensor_norm_cuda( } const auto options = tensors[0].options(); auto output_per_tensor = at::zeros( - {ntensors * max_chunks_per_tensor}, + {static_cast(ntensors) * max_chunks_per_tensor}, options.dtype(toOpMathType(tensors[0].scalar_type()))); - auto ret_per_tensor = at::empty({ntensors}, options); + + std::vector vec_res; + vec_res.reserve(ntensors); + for (const auto i : c10::irange(ntensors)) { + vec_res.push_back(at::empty({}, options)); + } auto tensor_lists = std::vector>{tensors.vec()}; if (p == static_cast(1)) { @@ -166,18 +216,35 @@ std::vector foreach_tensor_norm_cuda( using opmath_t = typename at::opmath_type; multi_tensor_apply<1>( tensor_lists, - LpNormFunctor(), + LpNormFunctor(), output_per_tensor.mutable_data_ptr(), max_chunks_per_tensor); C10_CUDA_KERNEL_LAUNCH_CHECK(); const at::cuda::OptionalCUDAGuard device_guard( device_of(output_per_tensor)); auto stream = at::cuda::getCurrentCUDAStream(); - lpnorm_cleanup<<>>( - output_per_tensor.const_data_ptr(), - ret_per_tensor.mutable_data_ptr(), - max_chunks_per_tensor); - C10_CUDA_KERNEL_LAUNCH_CHECK(); + + const size_t num_kernels = ceil_div(ntensors, MAX_TENSORS_PER_KERNEL); + for (const auto i : c10::irange(num_kernels)) { + const size_t num_tensors_this_kernel = + (i < num_kernels - 1 || ntensors % MAX_TENSORS_PER_KERNEL == 0) + ? MAX_TENSORS_PER_KERNEL + : (ntensors % MAX_TENSORS_PER_KERNEL); + + TensorListAddresses addr_struct; + for (const auto j : c10::irange(num_tensors_this_kernel)) { + addr_struct.addresses[j] = vec_res[i * MAX_TENSORS_PER_KERNEL + j] + .mutable_data_ptr(); + } + + lpnorm_cleanup + <<>>( + output_per_tensor.const_data_ptr() + + i * MAX_TENSORS_PER_KERNEL * max_chunks_per_tensor, + addr_struct, + max_chunks_per_tensor); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } }); } else if (p == static_cast(2)) { AT_DISPATCH_FLOATING_TYPES_AND2( @@ -189,18 +256,75 @@ std::vector foreach_tensor_norm_cuda( using opmath_t = typename at::opmath_type; multi_tensor_apply<1>( tensor_lists, - LpNormFunctor(), + LpNormFunctor(), output_per_tensor.mutable_data_ptr(), max_chunks_per_tensor); C10_CUDA_KERNEL_LAUNCH_CHECK(); const at::cuda::OptionalCUDAGuard device_guard( device_of(output_per_tensor)); auto stream = at::cuda::getCurrentCUDAStream(); - lpnorm_cleanup<<>>( - output_per_tensor.const_data_ptr(), - ret_per_tensor.mutable_data_ptr(), + + const size_t num_kernels = ceil_div(ntensors, MAX_TENSORS_PER_KERNEL); + for (const auto i : c10::irange(num_kernels)) { + const size_t num_tensors_this_kernel = + (i < num_kernels - 1 || ntensors % MAX_TENSORS_PER_KERNEL == 0) + ? 
MAX_TENSORS_PER_KERNEL + : (ntensors % MAX_TENSORS_PER_KERNEL); + + TensorListAddresses addr_struct; + for (const auto j : c10::irange(num_tensors_this_kernel)) { + addr_struct.addresses[j] = vec_res[i * MAX_TENSORS_PER_KERNEL + j] + .mutable_data_ptr(); + } + + lpnorm_cleanup + <<>>( + output_per_tensor.const_data_ptr() + + i * MAX_TENSORS_PER_KERNEL * max_chunks_per_tensor, + addr_struct, + max_chunks_per_tensor); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } + }); + } else if (p == std::numeric_limits::infinity()) { + AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, + kBFloat16, + tensor_lists[0][0].scalar_type(), + "foreach_tensor_norm_cuda", + [&]() { + using opmath_t = typename at::opmath_type; + multi_tensor_apply<1>( + tensor_lists, + LpNormFunctor(), + output_per_tensor.mutable_data_ptr(), max_chunks_per_tensor); C10_CUDA_KERNEL_LAUNCH_CHECK(); + const at::cuda::OptionalCUDAGuard device_guard( + device_of(output_per_tensor)); + auto stream = at::cuda::getCurrentCUDAStream(); + + const size_t num_kernels = ceil_div(ntensors, MAX_TENSORS_PER_KERNEL); + for (const auto i : c10::irange(num_kernels)) { + const size_t num_tensors_this_kernel = + (i < num_kernels - 1 || ntensors % MAX_TENSORS_PER_KERNEL == 0) + ? MAX_TENSORS_PER_KERNEL + : (ntensors % MAX_TENSORS_PER_KERNEL); + + TensorListAddresses addr_struct; + for (const auto j : c10::irange(num_tensors_this_kernel)) { + addr_struct.addresses[j] = vec_res[i * MAX_TENSORS_PER_KERNEL + j] + .mutable_data_ptr(); + } + + lpnorm_cleanup + <<>>( + output_per_tensor.const_data_ptr() + + i * MAX_TENSORS_PER_KERNEL * max_chunks_per_tensor, + addr_struct, + max_chunks_per_tensor); + C10_CUDA_KERNEL_LAUNCH_CHECK(); + } }); } else { TORCH_CHECK( @@ -216,7 +340,7 @@ std::vector foreach_tensor_norm_cuda( int i = 0; for (const auto& t : tensors) { if (t.numel() != 0) { - result.emplace_back(ret_per_tensor[i]); + result.emplace_back(vec_res[i]); i++; } else { result.emplace_back(at::zeros({}, options)); diff --git a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu index ff809d108d9ee..d7a118e6a9584 100644 --- a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu +++ b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu @@ -388,9 +388,10 @@ void foreach_tensor_zero_cuda_(TensorList tensors) { std::vector> tensor_lists; tensor_lists.emplace_back(tensors.vec()); - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND3( ScalarType::Half, ScalarType::BFloat16, + ScalarType::Bool, tensors[0].scalar_type(), "foreach_zero_cuda_", [&]() { diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu index 55d4b46364e75..3bc3b6f4cb510 100644 --- a/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu +++ b/aten/src/ATen/native/cuda/FractionalMaxPool2d.cu @@ -47,8 +47,8 @@ template __global__ void fractional_max_pool2d_out_cuda_frame( PackedTensorAccessor output, PackedTensorAccessor indices, - PackedTensorAccessor input, - PackedTensorAccessor samples, + PackedTensorAccessor input, + PackedTensorAccessor samples, int poolSizeH, int poolSizeW) { using accscalar_t = at::acc_type; @@ -103,8 +103,8 @@ __global__ void fractional_max_pool2d_out_cuda_frame( template __global__ void fractional_max_pool2d_backward_out_cuda_frame( PackedTensorAccessor gradInput, - PackedTensorAccessor gradOutput, - PackedTensorAccessor indices) { + PackedTensorAccessor gradOutput, + PackedTensorAccessor indices) { // Output (h, w) point that this thread is responsible for int ourOutputPoint = 
threadIdx.x + blockIdx.x * blockDim.x; int plane = blockIdx.y; @@ -186,10 +186,10 @@ TORCH_IMPL_FUNC(fractional_max_pool2d_out_cuda) ( input.scalar_type(), "fractional_max_pool2d_out_cuda_frame", [&] { - auto devInput = input_.packed_accessor64(); + auto devInput = input_.packed_accessor64(); auto devOutput = output_.packed_accessor64(); auto devIndices = indices_.packed_accessor64(); - auto devSamples = randomSamples.packed_accessor64(); + auto devSamples = randomSamples.packed_accessor64(); fractional_max_pool2d_out_cuda_frame <<>>( devOutput, devIndices, devInput, devSamples, @@ -254,7 +254,7 @@ TORCH_IMPL_FUNC(fractional_max_pool2d_backward_cuda)( gradInput_.size(0)); dim3 block(outputPlaneSize > 128 ? 128 : outputPlaneSize); - auto devIndices = indices_.packed_accessor64(); + auto devIndices = indices_.packed_accessor64(); AT_DISPATCH_FLOATING_TYPES_AND2( at::ScalarType::Half, at::ScalarType::BFloat16, @@ -262,7 +262,7 @@ TORCH_IMPL_FUNC(fractional_max_pool2d_backward_cuda)( "fractional_max_pool2d_backward_out_cuda_frame", [&] { auto devGradInput = gradInput_.packed_accessor64(); - auto devGradOutput = gradOutput_.packed_accessor64(); + auto devGradOutput = gradOutput_.packed_accessor64(); fractional_max_pool2d_backward_out_cuda_frame <<>>( devGradInput, devGradOutput, devIndices); diff --git a/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu b/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu index 9873b4da5998a..0bd2f50e12bb7 100644 --- a/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu +++ b/aten/src/ATen/native/cuda/FractionalMaxPool3d.cu @@ -53,10 +53,10 @@ __device__ inline int64_t get_intervals( template __global__ void fractional_max_pool3d_out_frame( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, PackedTensorAccessor64 indices, - PackedTensorAccessor64 samples, + PackedTensorAccessor64 samples, int64_t poolSizeT, int64_t poolSizeH, int64_t poolSizeW) { using accscalar_t = at::acc_type; // Output (t, h, w) point that this thread is responsible for @@ -120,8 +120,8 @@ __global__ void fractional_max_pool3d_out_frame( template __global__ void fractional_max_pool3d_backward_out_frame( PackedTensorAccessor64 gradInput, - PackedTensorAccessor64 gradOutput, - PackedTensorAccessor64 indices) { + PackedTensorAccessor64 gradOutput, + PackedTensorAccessor64 indices) { // Output (h, w) point that this thread is responsible for int64_t ourOutputPoint = threadIdx.x + blockIdx.x * blockDim.x; int64_t plane = blockIdx.y; @@ -235,8 +235,8 @@ void fractional_max_pool3d_backward_out_cuda_template( fractional_max_pool3d_backward_out_frame <<>>( gradInput_.packed_accessor64(), - gradOutput_.packed_accessor64(), - indices_.packed_accessor64() + gradOutput_.packed_accessor64(), + indices_.packed_accessor64() ); C10_CUDA_KERNEL_LAUNCH_CHECK(); } @@ -295,10 +295,10 @@ TORCH_IMPL_FUNC(fractional_max_pool3d_out_cuda) ( [&]{ fractional_max_pool3d_out_frame <<>>( - input_.packed_accessor64(), + input_.packed_accessor64(), output_.packed_accessor64(), indices_.packed_accessor64(), - randomSamples.packed_accessor64(), + randomSamples.packed_accessor64(), poolSizeT, poolSizeH, poolSizeW ); C10_CUDA_KERNEL_LAUNCH_CHECK(); diff --git a/aten/src/ATen/native/cuda/FusedSgdKernel.cu b/aten/src/ATen/native/cuda/FusedSgdKernel.cu new file mode 100644 index 0000000000000..36ac7401a2d0b --- /dev/null +++ b/aten/src/ATen/native/cuda/FusedSgdKernel.cu @@ -0,0 +1,427 @@ +#include +#include +#include +#include +#include +#include +#include + +namespace at::native { + 
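For readers of the fused kernel below, a plain scalar restatement of the per-element update that sgd_math applies (illustration only; gradient unscaling is omitted and sgd_step_scalar is a hypothetical name):

// Reference-only scalar form of the fused SGD step for one parameter element.
inline void sgd_step_scalar(
    double& p, double g, double& momentum_buffer, bool has_momentum_buffer,
    bool is_first_step, double lr, double weight_decay, double momentum,
    double dampening, bool nesterov, bool maximize) {
  if (maximize) {
    g = -g;
  }
  if (weight_decay != 0) {
    g += weight_decay * p;
  }
  if (has_momentum_buffer) {
    momentum_buffer = is_first_step
        ? g
        : momentum * momentum_buffer + (1 - dampening) * g;
    g = nesterov ? g + momentum * momentum_buffer : momentum_buffer;
  }
  p -= lr * g;
}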
+namespace { + +template +C10_DEVICE __forceinline__ void sgd_math( + scalar_t r_args[depth][kILP], + const double weight_decay, + const double momentum, + const float* lr_ptr, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const float* grad_scale_ptr) { + using opmath_t = at::opmath_type; + const double double_lr = lr_ptr != nullptr ? *lr_ptr : lr; +#pragma unroll + for (int ii = 0; ii < kILP; ii++) { + auto p = static_cast(r_args[0][ii]); + auto g = static_cast(r_args[1][ii]); + if (grad_scale_ptr) { + g /= static_cast(*grad_scale_ptr); + r_args[1][ii] = g; + } + if (maximize) { + g *= -1.0; + } + if (weight_decay != 0) { + g += weight_decay * p; + } + if (depth > 2) { + const auto momentum_buffer = is_first_step + ? g + : (momentum * static_cast(r_args[2][ii]) + + (1 - dampening) * g); + r_args[2][ii] = momentum_buffer; + + if (nesterov) { + g = g + momentum * momentum_buffer; + } else { + g = momentum_buffer; + } + } + p -= double_lr * g; + r_args[0][ii] = p; + } +} + +template +struct FusedSgdMathFunctor { + static_assert( + depth == 2 || depth == 3, + "depth of 2 for SGD w/ momentum == 0, 3 for SGD w/ momentum != 0"); + C10_DEVICE __forceinline__ void operator()( + const int chunk_size, + TensorListMetadata& tl, + const double weight_decay, + const double momentum, + const float* lr_ptr, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const float* grad_scale_ptr, + const float* found_inf_ptr) { + if (found_inf_ptr && *found_inf_ptr == 1) { + return; + } + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + + scalar_t* args[depth]; + scalar_t r_args[depth][kILP]; + const auto all_aligned{ + init_args(args, tl, chunk_idx, chunk_size, tensor_loc)}; + const auto n = tl.numel_for_tensor[tensor_loc] - chunk_idx * chunk_size; + +#ifndef USE_ROCM + const auto use_faster_load_store = + (n % kILP == 0) && (chunk_size % kILP == 0) && all_aligned; +#else + const auto use_faster_load_store{false}; +#endif + if (use_faster_load_store) { + for (auto i_start = threadIdx.x; + i_start * kILP < n && i_start * kILP < chunk_size; + i_start += blockDim.x) { +#pragma unroll + for (auto i = 0; i < depth; i++) { + load_store(r_args[i], args[i], 0, i_start); + } + sgd_math( + r_args, + weight_decay, + momentum, + lr_ptr, + lr, + dampening, + nesterov, + maximize, + is_first_step, + grad_scale_ptr); + load_store(args[0], r_args[0], i_start, 0); + if (grad_scale_ptr) { + load_store(args[1], r_args[1], i_start, 0); + } + if (depth > 2) { + load_store(args[2], r_args[2], i_start, 0); + } + } + } else { + for (auto i_start = 0; i_start < n && i_start < chunk_size; + i_start += blockDim.x * kILP) { + load_args(r_args, args, i_start, chunk_size, n); + sgd_math( + r_args, + weight_decay, + momentum, + lr_ptr, + lr, + dampening, + nesterov, + maximize, + is_first_step, + grad_scale_ptr); + store_args(args[0], r_args[0], i_start, chunk_size, n); + if (grad_scale_ptr) { + store_args(args[1], r_args[1], i_start, chunk_size, n); + } + if (depth > 2) { + store_args(args[2], r_args[2], i_start, chunk_size, n); + } + } + } + } +}; + +void _fused_sgd_with_momentum_kernel_cuda_( + at::TensorList params, + at::TensorList grads, + at::TensorList momentum_buffer_list, + const double weight_decay, + const double momentum, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool 
is_first_step, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + TORCH_CHECK_GT(momentum, 0); + TORCH_CHECK(at::native::check_fast_path_restrictions( + {params, grads, momentum_buffer_list})); + float* grad_scale_ptr = + grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; + float* found_inf_ptr = + found_inf.has_value() ? found_inf->data_ptr() : nullptr; + float* lr_ptr = nullptr; + + std::vector> tensor_lists{ + params.vec(), grads.vec(), momentum_buffer_list.vec()}; + AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, + kBFloat16, + params[0].scalar_type(), + "fused_sgd_with_momentum_kernel_cuda", + [&]() { + multi_tensor_apply<3>( + tensor_lists, + FusedSgdMathFunctor(), + weight_decay, + momentum, + lr_ptr, + lr, + dampening, + nesterov, + maximize, + is_first_step, + grad_scale_ptr, + found_inf_ptr); + }); +} + +void _fused_sgd_with_momentum_kernel_cuda_( + at::TensorList params, + at::TensorList grads, + at::TensorList momentum_buffer_list, + const double weight_decay, + const double momentum, + const at::Tensor& lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + if (lr.is_cpu()) { + _fused_sgd_with_momentum_kernel_cuda_( + params, + grads, + momentum_buffer_list, + weight_decay, + momentum, + lr.item(), + dampening, + nesterov, + maximize, + is_first_step, + grad_scale, + found_inf); + return; + } + TORCH_CHECK_GT(momentum, 0); + TORCH_CHECK(at::native::check_fast_path_restrictions( + {params, grads, momentum_buffer_list})); + if (grad_scale != c10::nullopt) { + TORCH_CHECK( + grad_scale->device() == params[0].device(), + "grad_scale must be on the same GPU device as the params"); + } + if (found_inf != c10::nullopt) { + TORCH_CHECK( + found_inf->device() == params[0].device(), + "found_inf must be on the same GPU device as the params"); + } + TORCH_CHECK( + lr.device() == params[0].device(), + "found_inf must be on the same GPU device as the params"); + float* grad_scale_ptr = + grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; + float* found_inf_ptr = + found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; + + std::vector> tensor_lists{ + params.vec(), grads.vec(), momentum_buffer_list.vec()}; + AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, + kBFloat16, + params[0].scalar_type(), + "fused_sgd_with_momentum_kernel_cuda", + [&]() { + multi_tensor_apply<3>( + tensor_lists, + FusedSgdMathFunctor(), + weight_decay, + momentum, + lr.data_ptr(), + 1.0, + dampening, + nesterov, + maximize, + is_first_step, + grad_scale_ptr, + found_inf_ptr); + }); +} + +} // namespace + +void _fused_sgd_kernel_cuda_( + at::TensorList params, + at::TensorList grads, + at::TensorList momentum_buffer_list, + const double weight_decay, + const double momentum, + const double lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + if (!momentum_buffer_list.empty()) { + _fused_sgd_with_momentum_kernel_cuda_( + params, + grads, + momentum_buffer_list, + weight_decay, + momentum, + lr, + dampening, + nesterov, + maximize, + is_first_step, + grad_scale, + found_inf); + return; + } + TORCH_CHECK_EQ(momentum, 0); + TORCH_CHECK(at::native::check_fast_path_restrictions({params, grads})); + if (is_first_step) { + TORCH_WARN_ONCE( + "`is_first_step` argument has no effect when `momentum_buffer_list` is empty"); + } + float* grad_scale_ptr = + grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; + float* found_inf_ptr = + found_inf.has_value() ? found_inf->data_ptr() : nullptr; + float* lr_ptr = nullptr; + + std::vector> tensor_lists{params.vec(), grads.vec()}; + AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, + kBFloat16, + params[0].scalar_type(), + "fused_sgd_kernel_cuda", + [&]() { + multi_tensor_apply<2>( + tensor_lists, + FusedSgdMathFunctor(), + weight_decay, + momentum, + lr_ptr, + lr, + dampening, + nesterov, + maximize, + /* is_first_step */ false, + grad_scale_ptr, + found_inf_ptr); + }); +} + +void _fused_sgd_kernel_cuda_( + at::TensorList params, + at::TensorList grads, + at::TensorList momentum_buffer_list, + const double weight_decay, + const double momentum, + const at::Tensor& lr, + const double dampening, + const bool nesterov, + const bool maximize, + const bool is_first_step, + const c10::optional& grad_scale, + const c10::optional& found_inf) { + if (!momentum_buffer_list.empty()) { + _fused_sgd_with_momentum_kernel_cuda_( + params, + grads, + momentum_buffer_list, + weight_decay, + momentum, + lr, + dampening, + nesterov, + maximize, + is_first_step, + grad_scale, + found_inf); + return; + } + if (lr.is_cpu()) { + _fused_sgd_kernel_cuda_( + params, + grads, + momentum_buffer_list, + weight_decay, + momentum, + lr.item(), + dampening, + nesterov, + maximize, + is_first_step, + grad_scale, + found_inf); + return; + } + TORCH_CHECK_EQ(momentum, 0); + TORCH_CHECK(at::native::check_fast_path_restrictions({params, grads})); + if (is_first_step) { + TORCH_WARN_ONCE( + "`is_first_step` argument has no effect when `momentum_buffer_list` is empty"); + } + if (grad_scale.has_value()) { + TORCH_CHECK( + grad_scale->device() == params[0].device(), + "grad_scale must be on the same GPU device as the params"); + } + if (found_inf.has_value()) { + TORCH_CHECK( + found_inf->device() == params[0].device(), + "found_inf must be on the same GPU device as the params"); + } + TORCH_CHECK( + lr.device() == params[0].device(), + "found_inf must be on the same GPU device as the params"); + float* grad_scale_ptr = + grad_scale.has_value() ? 
grad_scale->data_ptr() : nullptr; + float* found_inf_ptr = + found_inf.has_value() ? found_inf->data_ptr() : nullptr; + + std::vector> tensor_lists{params.vec(), grads.vec()}; + AT_DISPATCH_FLOATING_TYPES_AND2( + kHalf, + kBFloat16, + params[0].scalar_type(), + "fused_sgd_kernel_cuda", + [&]() { + multi_tensor_apply<2>( + tensor_lists, + FusedSgdMathFunctor(), + weight_decay, + momentum, + lr.data_ptr(), + 1.0, + dampening, + nesterov, + maximize, + /* is_first_step */ false, + grad_scale_ptr, + found_inf_ptr); + }); +} + +} // namespace at::native diff --git a/aten/src/ATen/native/cuda/GridSampler.cu b/aten/src/ATen/native/cuda/GridSampler.cu index 9d87cbc327114..2c9128eee2217 100644 --- a/aten/src/ATen/native/cuda/GridSampler.cu +++ b/aten/src/ATen/native/cuda/GridSampler.cu @@ -25,8 +25,8 @@ namespace { C10_LAUNCH_BOUNDS_1(256) __global__ void grid_sampler_2d_kernel( const index_t nthreads, - TensorInfo input, - TensorInfo grid, + TensorInfo input, + TensorInfo grid, TensorInfo output, const GridSamplerInterpolation interpolation_mode, const GridSamplerPadding padding_mode, @@ -104,7 +104,7 @@ namespace { index_t ix_nearest = static_cast(std::nearbyint(ix)); index_t iy_nearest = static_cast(std::nearbyint(iy)); - // assign nearest neighor pixel value to output pixel + // assign nearest neighbour pixel value to output pixel auto inp_ptr_NC = input.data + n * inp_sN; auto out_ptr_NCHW = output.data + n * out_sN + h * out_sH + w * out_sW; for (index_t c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCHW += out_sC) { @@ -155,8 +155,8 @@ namespace { C10_LAUNCH_BOUNDS_1(512) __global__ void grid_sampler_3d_kernel( const index_t nthreads, - TensorInfo input, - TensorInfo grid, + TensorInfo input, + TensorInfo grid, TensorInfo output, const GridSamplerInterpolation interpolation_mode, const GridSamplerPadding padding_mode, @@ -287,7 +287,7 @@ namespace { index_t iy_nearest = static_cast(std::nearbyint(iy)); index_t iz_nearest = static_cast(std::nearbyint(iz)); - // assign nearest neighor pixel value to output pixel + // assign nearest neighbour pixel value to output pixel auto inp_ptr_NC = input.data + n * inp_sN; auto out_ptr_NCDHW = output.data + n * out_sN + d * out_sD + h * out_sH + w * out_sW; for (index_t c = 0; c < C; ++c, inp_ptr_NC += inp_sC, out_ptr_NCDHW += out_sC) { @@ -311,9 +311,9 @@ namespace { C10_LAUNCH_BOUNDS_1(256) __global__ void grid_sampler_2d_backward_kernel( const index_t nthreads, - TensorInfo grad_output, - TensorInfo input, - TensorInfo grid, + TensorInfo grad_output, + TensorInfo input, + TensorInfo grid, TensorInfo grad_input, // initialized to zeros (or unused if input_requires_grad is false) TensorInfo grad_grid, // initialized to empty const GridSamplerInterpolation interpolation_mode, @@ -385,11 +385,11 @@ namespace { scalar_t se = (ix - ix_nw) * (iy - iy_nw); scalar_t gix = static_cast(0), giy = static_cast(0); - scalar_t *gOut_ptr_NCHW = grad_output.data + n * gOut_sN + h * gOut_sH + w * gOut_sW; + const scalar_t *gOut_ptr_NCHW = grad_output.data + n * gOut_sN + h * gOut_sH + w * gOut_sW; index_t NC_offset = n * gInp_sN; - scalar_t *inp_ptr_NC = input.data + n * inp_sN; + const scalar_t *inp_ptr_NC = input.data + n * inp_sN; for (index_t c = 0; c < C; ++c, inp_ptr_NC += inp_sC, NC_offset += gInp_sC, gOut_ptr_NCHW += gOut_sC) { - scalar_t gOut = *gOut_ptr_NCHW; + const scalar_t gOut = *gOut_ptr_NCHW; if (input_requires_grad) { // calculate and set grad_input. See Note [Passing pointer and offset to fastAtomicAdd]. 
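For context, the nw/ne/sw/se factors used in this backward pass are the standard bilinear weights of the sampling point relative to its four neighbouring pixels. A minimal host-side sketch for reference only (bilinear_weights is a hypothetical helper, not part of this file):

struct BilinearWeights {
  float nw, ne, sw, se;
};

// (ix, iy) is the continuous sampling location, (ix_nw, iy_nw) its north-west corner.
inline BilinearWeights bilinear_weights(float ix, float iy, int ix_nw, int iy_nw) {
  const float tx = ix - ix_nw;  // fractional x offset in [0, 1)
  const float ty = iy - iy_nw;  // fractional y offset in [0, 1)
  return {(1.f - tx) * (1.f - ty),  // nw
          tx * (1.f - ty),          // ne
          (1.f - tx) * ty,          // sw
          tx * ty};                 // se
}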
@@ -434,8 +434,8 @@ namespace { index_t ix_nearest = static_cast(std::nearbyint(ix)); index_t iy_nearest = static_cast(std::nearbyint(iy)); - // assign nearest neighor pixel value to output pixel - scalar_t *gOut_ptr_NCHW = grad_output.data + n * gOut_sN + h * gOut_sH + w * gOut_sW; + // assign nearest neighbour pixel value to output pixel + const scalar_t *gOut_ptr_NCHW = grad_output.data + n * gOut_sN + h * gOut_sH + w * gOut_sW; index_t NC_offset = n * gInp_sN; for (index_t c = 0; c < C; ++c, NC_offset += gInp_sC, gOut_ptr_NCHW += gOut_sC) { // calculate and set grad_input. See Note [Passing pointer and offset to fastAtomicAdd]. @@ -474,12 +474,12 @@ namespace { scalar_t gix = static_cast(0); scalar_t giy = static_cast(0); - scalar_t *gOut_ptr_NCHW = grad_output.data + n * gOut_sN + h * gOut_sH + w * gOut_sW; + const scalar_t *gOut_ptr_NCHW = grad_output.data + n * gOut_sN + h * gOut_sH + w * gOut_sW; index_t NC_offset = n * gInp_sN; - scalar_t *inp_ptr_NC = input.data + n * inp_sN; + const scalar_t *inp_ptr_NC = input.data + n * inp_sN; for (index_t c = 0; c < C; ++c, gOut_ptr_NCHW += gOut_sC, NC_offset += gInp_sC, inp_ptr_NC+= inp_sC) { - scalar_t gOut = *gOut_ptr_NCHW; + const scalar_t gOut = *gOut_ptr_NCHW; #pragma unroll 4 for (index_t i = 0; i < 4; ++i) { @@ -517,9 +517,9 @@ namespace { C10_LAUNCH_BOUNDS_1(256) __global__ void grid_sampler_3d_backward_kernel( const index_t nthreads, - TensorInfo grad_output, - TensorInfo input, - TensorInfo grid, + TensorInfo grad_output, + TensorInfo input, + TensorInfo grid, TensorInfo grad_input, // initialized to zeros (or unused if input_requires_grad is false) TensorInfo grad_grid, // initialized to empty const GridSamplerInterpolation interpolation_mode, @@ -630,12 +630,12 @@ namespace { scalar_t bse = (ix - ix_tnw) * (iy - iy_tnw) * (iz - iz_tnw); scalar_t gix = static_cast(0), giy = static_cast(0), giz = static_cast(0); - scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + const scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; index_t NC_offset; if (input_requires_grad) { NC_offset = n * gInp_sN; } - scalar_t *inp_ptr_NC = input.data + n * inp_sN; + const scalar_t *inp_ptr_NC = input.data + n * inp_sN; // calculate bilinear weighted pixel value and set output pixel for (index_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, NC_offset += gInp_sC, inp_ptr_NC += inp_sC) { scalar_t gOut = *gOut_ptr_NCDHW; @@ -724,8 +724,8 @@ namespace { auto iy_nearest = static_cast(std::nearbyint(iy)); auto iz_nearest = static_cast(std::nearbyint(iz)); - // assign nearest neighor pixel value to output pixel - scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; + // assign nearest neighbour pixel value to output pixel + const scalar_t *gOut_ptr_NCDHW = grad_output.data + n * gOut_sN + d * gOut_sD + h * gOut_sH + w * gOut_sW; index_t NC_offset = n * gInp_sN; for (index_t c = 0; c < C; ++c, gOut_ptr_NCDHW += gOut_sC, NC_offset += gInp_sC) { // calculate and set grad_input. See Note [Passing pointer and offset to fastAtomicAdd]. 
@@ -768,8 +768,8 @@ void launch_grid_sampler_2d_forward_kernel( grid_sampler_2d_kernel <<>>( static_cast(count), - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(input), + getTensorInfo(grid), getTensorInfo(output), static_cast(interpolation_mode), static_cast(padding_mode), @@ -779,8 +779,8 @@ void launch_grid_sampler_2d_forward_kernel( grid_sampler_2d_kernel <<>>( count, - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(input), + getTensorInfo(grid), getTensorInfo(output), static_cast(interpolation_mode), static_cast(padding_mode), @@ -813,8 +813,8 @@ void launch_grid_sampler_3d_forward_kernel( grid_sampler_3d_kernel <<>>( static_cast(count), - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(input), + getTensorInfo(grid), getTensorInfo(output), static_cast(interpolation_mode), static_cast(padding_mode), @@ -824,8 +824,8 @@ void launch_grid_sampler_3d_forward_kernel( grid_sampler_3d_kernel <<>>( count, - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(input), + getTensorInfo(grid), getTensorInfo(output), static_cast(interpolation_mode), static_cast(padding_mode), @@ -868,9 +868,9 @@ void launch_grid_sampler_2d_backward_kernel( grid_sampler_2d_backward_kernel <<>>( static_cast(count), - getTensorInfo(grad_output), - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(grad_output), + getTensorInfo(input), + getTensorInfo(grid), input_requires_grad ? getTensorInfo(grad_input) : TensorInfo(), getTensorInfo(grad_grid), static_cast(interpolation_mode), @@ -883,9 +883,9 @@ void launch_grid_sampler_2d_backward_kernel( grid_sampler_2d_backward_kernel <<>>( count, - getTensorInfo(grad_output), - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(grad_output), + getTensorInfo(input), + getTensorInfo(grid), input_requires_grad ? getTensorInfo(grad_input) : TensorInfo(), getTensorInfo(grad_grid), static_cast(interpolation_mode), @@ -927,9 +927,9 @@ void launch_grid_sampler_3d_backward_kernel( grid_sampler_3d_backward_kernel <<>>( static_cast(count), - getTensorInfo(grad_output), - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(grad_output), + getTensorInfo(input), + getTensorInfo(grid), input_requires_grad ? getTensorInfo(grad_input) : TensorInfo(), getTensorInfo(grad_grid), static_cast(interpolation_mode), @@ -942,9 +942,9 @@ void launch_grid_sampler_3d_backward_kernel( grid_sampler_3d_backward_kernel <<>>( count, - getTensorInfo(grad_output), - getTensorInfo(input), - getTensorInfo(grid), + getTensorInfo(grad_output), + getTensorInfo(input), + getTensorInfo(grid), input_requires_grad ? 
getTensorInfo(grad_input) : TensorInfo(), getTensorInfo(grad_grid), static_cast(interpolation_mode), diff --git a/aten/src/ATen/native/cuda/GridSampler.cuh b/aten/src/ATen/native/cuda/GridSampler.cuh index a0e3b16c3a43a..731f4d7824bf1 100644 --- a/aten/src/ATen/native/cuda/GridSampler.cuh +++ b/aten/src/ATen/native/cuda/GridSampler.cuh @@ -228,7 +228,7 @@ bool within_bounds_3d(int d, int h, int w, int D, int H, int W) { template static __forceinline__ __device__ scalar_t get_value_bounded( - scalar_t *data, scalar_t x, scalar_t y, int W, int H, int sW, int sH, + const scalar_t *data, scalar_t x, scalar_t y, int W, int H, int sW, int sH, GridSamplerPadding padding_mode, bool align_corners) { diff --git a/aten/src/ATen/native/cuda/IGammaKernel.cu b/aten/src/ATen/native/cuda/IGammaKernel.cu index be3f7fc54a6b3..7102110fb4fd3 100644 --- a/aten/src/ATen/native/cuda/IGammaKernel.cu +++ b/aten/src/ATen/native/cuda/IGammaKernel.cu @@ -450,7 +450,7 @@ __noinline__ __host__ __device__ scalar_t calc_igammac(scalar_t a, scalar_t x) { } // NOTE: this __noinline__ is important -- otherwise, observed compile times significantly -// increase. The same kernel seems to get recompiled mulitple times via gpu_kernel_with_scalars, +// increase. The same kernel seems to get recompiled multiple times via gpu_kernel_with_scalars, // multiple dtypes, etc. template __noinline__ __host__ __device__ scalar_t calc_igamma(scalar_t a, scalar_t x) { diff --git a/aten/src/ATen/native/cuda/IndexKernel.cu b/aten/src/ATen/native/cuda/IndexKernel.cu index 657c0c77b3d67..5682ba2757315 100644 --- a/aten/src/ATen/native/cuda/IndexKernel.cu +++ b/aten/src/ATen/native/cuda/IndexKernel.cu @@ -333,7 +333,7 @@ void take_kernel( // Cannot use `OpaqueType`, as Tensor::data_ptr> is not implemented AT_DISPATCH_INDEX_TYPES(cuda::detail::canUse32BitIndexMath(input) ? 
ScalarType::Int : ScalarType::Long, "take_cuda_index", [&] { - const auto* __restrict__ indexed_ptr = input.template data_ptr(); + const auto* __restrict__ indexed_ptr = input.template const_data_ptr(); cuda_take_put_kernel(iter, input, [indexed_ptr] __device__(scalar_t& iterated, const index_t offset) { iterated = indexed_ptr[offset]; @@ -385,7 +385,7 @@ void launch_masked_scatter_kernel( .resize_outputs(false) .add_output(self) .add_input(self) - .add_input(mask_cont) + .add_const_input(mask_cont) .add_input(maskPrefixSum) .build(); diff --git a/aten/src/ATen/native/cuda/Indexing.cu b/aten/src/ATen/native/cuda/Indexing.cu index 607cb9e2c9c56..c3eadde686355 100644 --- a/aten/src/ATen/native/cuda/Indexing.cu +++ b/aten/src/ATen/native/cuda/Indexing.cu @@ -414,9 +414,7 @@ static std::tuple __global__ void indexFuncSmallIndex(cuda::detail::TensorInfo dst, - cuda::detail::TensorInfo src, - cuda::detail::TensorInfo indices, + cuda::detail::TensorInfo src, + cuda::detail::TensorInfo indices, int dstAddDim, int srcAddDim, IndexType innerSize, @@ -744,7 +742,7 @@ __global__ void indexFuncSmallIndex(cuda::detail::TensorInfo dst, for (IndexType srcIndex = 0; srcIndex < indices.sizes[0]; ++srcIndex) { // Lua indices begin at 1 IndexType dstIndex = - indices.data[cuda::detail::IndexToOffset::get(srcIndex, indices)]; + indices.data[cuda::detail::IndexToOffset::get(srcIndex, indices)]; CUDA_KERNEL_ASSERT(dstIndex < dstAddDimSize); // We stride over the output ignoring the indexed dimension @@ -757,7 +755,7 @@ __global__ void indexFuncSmallIndex(cuda::detail::TensorInfo dst, dstOffset += dstIndex * dst.strides[dstAddDim]; IndexType srcOffset = - cuda::detail::IndexToOffset::get(linearIndex, src); + cuda::detail::IndexToOffset::get(linearIndex, src); srcOffset += srcIndex * src.strides[srcAddDim]; T val = src.data[srcOffset] * alpha; @@ -776,8 +774,8 @@ __global__ void indexFuncSmallIndex(cuda::detail::TensorInfo dst, template __global__ void indexFuncLargeIndex(cuda::detail::TensorInfo dst, - cuda::detail::TensorInfo src, - cuda::detail::TensorInfo indices, + cuda::detail::TensorInfo src, + cuda::detail::TensorInfo indices, int dstAddDim, int srcAddDim, IndexType totalSize, @@ -803,7 +801,7 @@ __global__ void indexFuncLargeIndex(cuda::detail::TensorInfo dst, // Lua indices begin at 1 IndexType dstIndex = - indices.data[cuda::detail::IndexToOffset::get(srcIndex, indices)]; + indices.data[cuda::detail::IndexToOffset::get(srcIndex, indices)]; CUDA_KERNEL_ASSERT(dstIndex < dstAddDimSize); IndexType dstOffset = @@ -811,7 +809,7 @@ __global__ void indexFuncLargeIndex(cuda::detail::TensorInfo dst, dstOffset += dstIndex * dst.strides[dstAddDim]; IndexType srcOffset = - cuda::detail::IndexToOffset::get(elementInSlice, src); + cuda::detail::IndexToOffset::get(elementInSlice, src); srcOffset += srcIndex * src.strides[srcAddDim]; T val = src.data[srcOffset] * alpha; @@ -933,12 +931,12 @@ void index_add_cuda_impl(const Tensor& self, int64_t dim, const Tensor& index, c const auto alpha_value = alpha.to(); AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_add_cuda_", [&] () { auto sourceInfo = - cuda::detail::getTensorInfo(source_); + cuda::detail::getTensorInfo(source_); const int sourceAddDim = sourceInfo.collapseDims(dim); sourceInfo.reduceDim(sourceAddDim); auto indexInfo = - cuda::detail::getTensorInfo(index); + cuda::detail::getTensorInfo(index); indexInfo.collapseDims(); // A reasonable choice for when to have each thread iterate over @@ -984,14 +982,14 @@ void index_add_cuda_impl(const Tensor& self, int64_t 
dim, const Tensor& index, c selfInfo.reduceDim(selfAddDim); const auto alpha_value = alpha.to(); - cuda::detail::TensorInfo sourceInfo = - cuda::detail::getTensorInfo(source_); + cuda::detail::TensorInfo sourceInfo = + cuda::detail::getTensorInfo(source_); const int sourceAddDim = sourceInfo.collapseDims(dim); sourceInfo.reduceDim(sourceAddDim); AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_add_cuda_", [&] () { - cuda::detail::TensorInfo indexInfo = - cuda::detail::getTensorInfo(index); + cuda::detail::TensorInfo indexInfo = + cuda::detail::getTensorInfo(index); indexInfo.collapseDims(); LARGE_INDEX(scalar_t, index_t, uint64_t, -1, -1, -1, true); @@ -1106,12 +1104,12 @@ void index_reduce_func_cuda_impl( auto alpha_value = (scalar_t) 1; AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_reduce_cuda", [&] () { auto sourceInfo = - cuda::detail::getTensorInfo(source_); + cuda::detail::getTensorInfo(source_); int sourceReduceDim = sourceInfo.collapseDims(dim); sourceInfo.reduceDim(sourceReduceDim); auto indexInfo = - cuda::detail::getTensorInfo(index); + cuda::detail::getTensorInfo(index); indexInfo.collapseDims(); // A reasonable choice for when to have each thread iterate over @@ -1157,14 +1155,14 @@ void index_reduce_func_cuda_impl( selfInfo.reduceDim(selfReduceDim); auto alpha_value = (scalar_t) 1; - cuda::detail::TensorInfo sourceInfo = - cuda::detail::getTensorInfo(source_); + cuda::detail::TensorInfo sourceInfo = + cuda::detail::getTensorInfo(source_); int sourceReduceDim = sourceInfo.collapseDims(dim); sourceInfo.reduceDim(sourceReduceDim); AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_reduce_cuda", [&] () { - cuda::detail::TensorInfo indexInfo = - cuda::detail::getTensorInfo(index); + cuda::detail::TensorInfo indexInfo = + cuda::detail::getTensorInfo(index); indexInfo.collapseDims(); LARGE_INDEX(scalar_t, index_t, uint64_t, -1, -1, -1, true); @@ -1221,8 +1219,8 @@ namespace { // parallelism. template __global__ void indexSelectSmallIndex(cuda::detail::TensorInfo dst, - cuda::detail::TensorInfo src, - cuda::detail::TensorInfo indices, + cuda::detail::TensorInfo src, + cuda::detail::TensorInfo indices, int dstSelectDim, int srcSelectDim, IndexType innerSize, @@ -1234,7 +1232,7 @@ __global__ void indexSelectSmallIndex(cuda::detail::TensorInfo dst // re-accessing indices in addition to src elements can be slow. 
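As a plain-CPU reference for what the indexSelect kernels compute, for a contiguous 2-D input and dim == 0 the result is dst[i][j] = src[indices[i]][j]; the CUDA kernels generalize this to arbitrary strides and swap the slice/index loops for memory locality. A minimal sketch (illustration only, index_select_dim0 is a hypothetical helper):

#include <cstdint>
#include <vector>

std::vector<float> index_select_dim0(
    const std::vector<float>& src,       // row-major, shape [rows, cols]
    int64_t cols,
    const std::vector<int64_t>& indices) {
  std::vector<float> dst(indices.size() * cols);
  for (size_t i = 0; i < indices.size(); ++i) {
    for (int64_t j = 0; j < cols; ++j) {
      dst[i * cols + j] = src[indices[i] * cols + j];
    }
  }
  return dst;
}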
for (IndexType dstIndex = 0; dstIndex < indices.sizes[0]; ++dstIndex) { IndexType srcIndex = - indices.data[cuda::detail::IndexToOffset::get(dstIndex, indices)]; + indices.data[cuda::detail::IndexToOffset::get(dstIndex, indices)]; CUDA_KERNEL_ASSERT(srcIndex < srcSelectDimSize); // We stride over the output ignoring the indexed dimension @@ -1247,7 +1245,7 @@ __global__ void indexSelectSmallIndex(cuda::detail::TensorInfo dst dstOffset += dstIndex * dst.strides[dstSelectDim]; IndexType srcOffset = - cuda::detail::IndexToOffset::get(linearIndex, src); + cuda::detail::IndexToOffset::get(linearIndex, src); srcOffset += srcIndex * src.strides[srcSelectDim]; dst.data[dstOffset] = src.data[srcOffset]; @@ -1264,8 +1262,8 @@ __global__ void indexSelectSmallIndex(cuda::detail::TensorInfo dst template __global__ void indexSelectLargeIndex(cuda::detail::TensorInfo dst, - cuda::detail::TensorInfo src, - cuda::detail::TensorInfo indices, + cuda::detail::TensorInfo src, + cuda::detail::TensorInfo indices, int dstSelectDim, int srcSelectDim, IndexType totalSize, @@ -1287,7 +1285,7 @@ __global__ void indexSelectLargeIndex(cuda::detail::TensorInfo dst } IndexType srcIndex = - indices.data[cuda::detail::IndexToOffset::get(dstIndex, indices)]; + indices.data[cuda::detail::IndexToOffset::get(dstIndex, indices)]; CUDA_KERNEL_ASSERT(srcIndex < srcSelectDimSize); IndexType dstOffset = @@ -1295,7 +1293,7 @@ __global__ void indexSelectLargeIndex(cuda::detail::TensorInfo dst dstOffset += dstIndex * dst.strides[dstSelectDim]; IndexType srcOffset = - cuda::detail::IndexToOffset::get(elementInSlice, src); + cuda::detail::IndexToOffset::get(elementInSlice, src); srcOffset += srcIndex * src.strides[srcSelectDim]; dst.data[dstOffset] = src.data[srcOffset]; @@ -1395,12 +1393,12 @@ void index_select_out_cuda_impl( int outSelectDim = outInfo.collapseDims(dim); outInfo.reduceDim(outSelectDim); - auto selfInfo = tensorInfoLegacyIfScalar(cuda::detail::getTensorInfo(self)); + auto selfInfo = tensorInfoLegacyIfScalar(cuda::detail::getTensorInfo(self)); int selfSelectDim = selfInfo.collapseDims(dim); selfInfo.reduceDim(selfSelectDim); AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_select_out_cuda_impl", [&] () { - auto indicesInfo = tensorInfoLegacyIfScalar(cuda::detail::getTensorInfo(index)); + auto indicesInfo = tensorInfoLegacyIfScalar(cuda::detail::getTensorInfo(index)); indicesInfo.collapseDims(); // A reasonable choice for when to have each thread iterate over @@ -1442,11 +1440,11 @@ void index_select_out_cuda_impl( int outSelectDim = outInfo.collapseDims(dim); outInfo.reduceDim(outSelectDim); - auto selfInfo = tensorInfoLegacyIfScalar(cuda::detail::getTensorInfo(self)); + auto selfInfo = tensorInfoLegacyIfScalar(cuda::detail::getTensorInfo(self)); int selfSelectDim = selfInfo.collapseDims(dim); selfInfo.reduceDim(selfSelectDim); AT_DISPATCH_INDEX_TYPES(index.scalar_type(), "index_select_out_cuda_impl", [&] () { - auto indicesInfo = tensorInfoLegacyIfScalar(cuda::detail::getTensorInfo(index)); + auto indicesInfo = tensorInfoLegacyIfScalar(cuda::detail::getTensorInfo(index)); indicesInfo.collapseDims(); LARGE_INDEX(scalar_t, index_t, uint64_t, -1, -1, -1, true); @@ -1576,8 +1574,8 @@ Tensor & masked_fill__cuda(Tensor& self, const Tensor & mask, const Scalar& valu .check_all_same_dtype(false) .resize_outputs(false) .add_output(self) - .add_input(self) - .add_input(*b_mask) + .add_const_input(self) + .add_const_input(*b_mask) .build(); masked_fill_kernel(iter, value); diff --git a/aten/src/ATen/native/cuda/Loops.cuh 
b/aten/src/ATen/native/cuda/Loops.cuh index fe38b1e17f24e..cb14f275e2171 100644 --- a/aten/src/ATen/native/cuda/Loops.cuh +++ b/aten/src/ATen/native/cuda/Loops.cuh @@ -68,17 +68,7 @@ __device__ inline void elementwise_kernel_helper(func_t f, policy_t policy) { }} // namespace at::native -// Note: -// CUDA and ROCm get diverged in this PR: -// https://github.com/pytorch/pytorch/pull/32383 -// Because for some reason trying to enable vectorized -// memory access introduce regression on ROCm. - -#if !defined(USE_ROCM) - #include -#else - #include -#endif +#include namespace at:: native { diff --git a/aten/src/ATen/native/cuda/Loss.cu b/aten/src/ATen/native/cuda/Loss.cu index 3f76f0931bfbc..1691adca87253 100644 --- a/aten/src/ATen/native/cuda/Loss.cu +++ b/aten/src/ATen/native/cuda/Loss.cu @@ -219,7 +219,7 @@ __global__ void nll_loss_forward_reduce_cuda_kernel_1d( *output = -cur_weight * input[t]; } } else { - // If the only element was omited, we get 0. See the discussion in + // If the only element was omitted, we get 0. See the discussion in // https://github.com/pytorch/pytorch/pull/64572#issuecomment-926504162 *output = scalar_t{0}; *total_weight = scalar_t{0}; @@ -408,7 +408,7 @@ template __global__ void nll_loss_backward_no_reduce_cuda_kernel( int batch_size, const index_t *target, - PackedTensorAccessor64 grad_output, + PackedTensorAccessor64 grad_output, PackedTensorAccessor64 grad_input, const scalar_t *weights, int64_t n_classes, @@ -520,7 +520,7 @@ void nll_loss_backward_out_cuda_template( at::cuda::getCurrentCUDAStream()>>>( batch_size, target.const_data_ptr(), - grad_output.packed_accessor64(), + grad_output.packed_accessor64(), grad_input.packed_accessor64(), weight.defined() ? weight_.const_data_ptr() : nullptr, n_classes, diff --git a/aten/src/ATen/native/cuda/LossCTC.cu b/aten/src/ATen/native/cuda/LossCTC.cu index 5fb86d16e95a9..b451592f19440 100644 --- a/aten/src/ATen/native/cuda/LossCTC.cu +++ b/aten/src/ATen/native/cuda/LossCTC.cu @@ -44,7 +44,7 @@ namespace { // so if l is l_0 l_1 ... l_(tl-1) then this looks up idx in // l' = BLANK l_0 BLANK l_1 BLANK ... BLANK l_(tl-1) BLANK // - note that no bound-checking is done -// - it is important to only call it witth idx == 0 if the target length is 0 +// - it is important to only call it with idx == 0 if the target length is 0 // - __restrict__ impact to be measured, see // https://devblogs.nvidia.com/cuda-pro-tip-optimize-pointer-aliasing/ template @@ -97,6 +97,14 @@ ctc_loss_log_alpha_gpu_kernel(scalar_t* __restrict__ log_alpha_data, if (b >= batch_size) return; + if (input_length == 0) { + if (threadIdx.x == 0) { + scalar_t log_likelihood = target_length == 0 ? 
0 : neginf; + neg_log_likelihood_data[b] = -log_likelihood; + } + return; + } + // first row (t=0), the three equations for alpha_1 above eq (6) for (int64_t block_s = 0; block_s < 2*max_target_length+1; block_s += blockDim.x) { int64_t s = threadIdx.x + block_s; @@ -237,6 +245,9 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const if (targets.dim() == 1) { // concatenated targets int64_t pos = 0; for (int64_t i = 0; i < batch_size; i++) { + TORCH_CHECK(target_lengths[i] >= 0, + "Expected target_lengths to have value at least ", 0, ", but got value ", target_lengths[i], + " (while checking arguments for ", c, ")"); tg_batch_offsets_data[i] = pos; pos += target_lengths[i]; if (max_target_length < target_lengths[i]) @@ -249,6 +260,9 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const // dim is 2 int64_t tg_batch_stride = targets.stride(0); for (int64_t i = 0; i < batch_size; i++) { + TORCH_CHECK(target_lengths[i] >= 0, + "Expected target_lengths to have value at least ", 0, ", but got value ", target_lengths[i], + " (while checking arguments for ", c, ")"); tg_batch_offsets_data[i] = i * tg_batch_stride; if (max_target_length < target_lengths[i]) max_target_length = target_lengths[i]; @@ -261,6 +275,9 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const } int64_t max_input_length = log_probs.size(0); for (int64_t b = 0; b < batch_size; b++) { + TORCH_CHECK(input_lengths[b] >= 0, + "Expected input_lengths to have value at least ", 0, ", but got value ", input_lengths[b], + " (while checking arguments for ", c, ")"); TORCH_CHECK(input_lengths[b] <= max_input_length, "Expected input_lengths to have value at most ", max_input_length, ", but got value ", input_lengths[b], " (while checking arguments for ", c, ")"); @@ -273,8 +290,8 @@ std::tuple ctc_loss_gpu_template(const Tensor& log_probs, const Tensor log_alpha = at::empty({batch_size, log_probs.size(0), 2*max_target_length+1}, log_probs.options()); Tensor neg_log_likelihood = at::empty({batch_size}, log_probs.options()); - // Very likely, we could be more clever here, e.g. learning (or genralizing and reusing) from SoftMax.cu... - constexpr int max_threads = std::is_same::value ? 1024 : 896; // we need 72 or so 32 bit registers for double + // Very likely, we could be more clever here, e.g. learning (or generalizing and reusing) from SoftMax.cu... + constexpr int max_threads = std::is_same::value ? 
1024 : 768; // we need 72 or so 32 bit registers for double int threads_target = max_threads; while (threads_target / 2 >= 2*max_target_length+1) { threads_target /= 2; @@ -322,7 +339,10 @@ ctc_loss_backward_log_beta_gpu_kernel(scalar_t* __restrict__ log_beta_data, if (b >= batch_size) return; - // "first" row, the beta initiaization before eq (10) (t=target_length - differes per batch) + if (input_length == 0) + return; + + // "first" row, the beta initialization before eq (10) (t=target_length - differes per batch) for (int64_t block_s = 2*max_target_length - (2*max_target_length % blockDim.x); block_s >= 0; block_s -= blockDim.x) { int64_t s = threadIdx.x + block_s; scalar_t lb; diff --git a/aten/src/ATen/native/cuda/MaxUnpooling.cu b/aten/src/ATen/native/cuda/MaxUnpooling.cu index 340162c649ece..b364d679fa3b1 100644 --- a/aten/src/ATen/native/cuda/MaxUnpooling.cu +++ b/aten/src/ATen/native/cuda/MaxUnpooling.cu @@ -51,8 +51,8 @@ __global__ void max_unpooling2d_forward_kernel( template __global__ void max_unpooling3d_forward_kernel( - PackedTensorAccessor64 input, - PackedTensorAccessor64 indices, + PackedTensorAccessor64 input, + PackedTensorAccessor64 indices, T* output, const int64_t oT, const int64_t oH, @@ -64,8 +64,8 @@ __global__ void max_unpooling3d_forward_kernel( int64_t slice = (blockIdx.z + offsetZ) / input.size(1); // input slice/feature int64_t outputImageSize = oT * oH * oW; if (iRow < input.size(2) && iColumn < input.size(3)) { - T val = input[slice][iFrame][iRow][iColumn]; - int64_t index = indices[slice][iFrame][iRow][iColumn]; + const T val = input[slice][iFrame][iRow][iColumn]; + const int64_t index = indices[slice][iFrame][iRow][iColumn]; CUDA_KERNEL_ASSERT(index >= 0 && index < outputImageSize); output[slice * oT * oH * oW + index] = val; } @@ -370,8 +370,8 @@ Tensor& max_unpooling3d_forward_out_cuda(const Tensor& self_, block, 0, at::cuda::getCurrentCUDAStream()>>>( - self.packed_accessor64(), - indices.packed_accessor64(), + self.packed_accessor64(), + indices.packed_accessor64(), output.mutable_data_ptr(), oT, oH, diff --git a/aten/src/ATen/native/cuda/MemoryAccess.cuh b/aten/src/ATen/native/cuda/MemoryAccess.cuh index 8f47a039a1e3f..0fdc813fd7770 100644 --- a/aten/src/ATen/native/cuda/MemoryAccess.cuh +++ b/aten/src/ATen/native/cuda/MemoryAccess.cuh @@ -109,7 +109,7 @@ struct LoadWithCast { size_array_t element_sizes; LoadWithCast(const TensorIteratorBase& iter) { - assert(iter.ninputs() == N); + CUDA_KERNEL_ASSERT(iter.ninputs() == N); #pragma unroll for (auto i = 0; i < N; ++i) { this->dtypes[i] = iter.dtype(i + iter.noutputs()); @@ -140,7 +140,7 @@ struct StoreWithCast { size_array_t element_sizes; StoreWithCast(const TensorIteratorBase& iter) { - assert(iter.noutputs() == N); + CUDA_KERNEL_ASSERT(iter.noutputs() == N); #pragma unroll for (auto i = 0; i < N; ++i) { this->dtypes[i] = iter.dtype(i); @@ -197,7 +197,7 @@ struct unroll { data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc), loader(l), storer(s) {} __device__ inline bool check_inbounds(int thread_work_elem) { - return ((threadIdx.x + thread_work_elem*num_threads()) < remaining); + return ((int)(threadIdx.x + thread_work_elem*num_threads()) < remaining); } template @@ -219,7 +219,6 @@ struct unroll { template __device__ inline void store(scalar_t *from, int idx) { int thread_idx = threadIdx.x; - scalar_t *to = reinterpret_cast(data[0]) + block_work_size() * idx; #pragma unroll for (int i = 0; i < thread_work_size(); i++) { if (thread_idx >= remaining) { @@ 
-305,7 +304,7 @@ struct multi_outputs_unroll { data(data), remaining(remaining), input_offset_calculator(ic), output_offset_calculator(oc) {} __device__ inline bool check_inbounds(int thread_work_elem) { - return ((threadIdx.x + thread_work_elem*num_threads()) < remaining); + return ((int)(threadIdx.x + thread_work_elem*num_threads()) < remaining); } template @@ -347,7 +346,7 @@ struct multi_outputs_unroll { // which is C10_HOST_DEVICE, so we have to make this C10_HOST_DEVICE // in order to compile template -inline C10_HOST_DEVICE int can_vectorize_up_to(char *pointer) { +inline C10_HOST_DEVICE int can_vectorize_up_to(const char *pointer) { uint64_t address = reinterpret_cast(pointer); constexpr int vec2_alignment = std::alignment_of>::value; constexpr int vec4_alignment = std::alignment_of>::value; @@ -359,6 +358,11 @@ inline C10_HOST_DEVICE int can_vectorize_up_to(char *pointer) { return 1; } +template +inline C10_HOST_DEVICE int can_vectorize_up_to(char *pointer) { + return can_vectorize_up_to(static_cast(pointer)); +} + template struct can_vectorize_up_to_helper { template diff --git a/aten/src/ATen/native/cuda/NLLLoss2d.cu b/aten/src/ATen/native/cuda/NLLLoss2d.cu index 53d4238806b6e..94c9aeba79f51 100644 --- a/aten/src/ATen/native/cuda/NLLLoss2d.cu +++ b/aten/src/ATen/native/cuda/NLLLoss2d.cu @@ -56,6 +56,7 @@ __global__ void nll_loss2d_forward_no_reduce_kernel( int64_t ignore_index ) { int64_t batch_size = input.size(0); + int64_t n_classes = input.size(1); int64_t H = input.size(2); int64_t W = input.size(3); @@ -69,6 +70,7 @@ __global__ void nll_loss2d_forward_no_reduce_kernel( output[b][h][w] = static_cast(0); continue; } + CUDA_KERNEL_ASSERT(cur_target >= 0 && cur_target < n_classes); scalar_t value = input[b][cur_target][h][w]; scalar_t cur_weight = weight != nullptr ? 
weight[cur_target] : static_cast(1); output[b][h][w] = -value * cur_weight; diff --git a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu index c6c115dc640d8..247b1728badea 100644 --- a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu +++ b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose2d.cu @@ -219,7 +219,7 @@ void slow_conv_transpose2d_out_cuda_template( // For each elt in batch, do: for (int elt = 0; elt < batch_size; elt++) { - // Matrix mulitply per output: + // Matrix multiply per output: input_n = input_.select(0, elt); output_n = output.select(0, elt); @@ -419,7 +419,7 @@ static void slow_conv_transpose2d_backward_out_cuda_template( // For each elt in batch, do: for (int elt = 0; elt < batch_size; elt++) { - // Matrix mulitply per sample: + // Matrix multiply per sample: grad_input_n = grad_input.select(0, elt); grad_output_n = grad_output.select(0, elt); @@ -611,12 +611,12 @@ void slow_conv_transpose2d_acc_grad_parameters_cuda_template( // For each elt in batch, do: for (int elt = 0; elt < batch_size; elt++) { - // Matrix mulitply per output: + // Matrix multiply per output: grad_output_n = grad_output.select(0, elt); // Do Weight: if (grad_weight.defined()) { - // Matrix mulitply per output: + // Matrix multiply per output: input_n = input.select(0, elt); if (need_columns) { diff --git a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu index 1074769392b48..fd6e83aa24171 100644 --- a/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu +++ b/aten/src/ATen/native/cuda/NaiveConvolutionTranspose3d.cu @@ -301,7 +301,7 @@ void slow_conv_transpose3d_out_cuda_template( // For each elt in batch, do: for (int elt = 0; elt < batch_size; elt++) { - // Matrix mulitply per output: + // Matrix multiply per output: input_n = input.select(0, elt); output_n = output.select(0, elt); @@ -531,7 +531,7 @@ void slow_conv_transpose3d_backward_out_cuda_template( // For each elt in batch, do: for (int elt = 0; elt < batch_size; elt++) { - // Matrix mulitply per sample: + // Matrix multiply per sample: grad_input_n = grad_input.select(0, elt); grad_output_n = grad_output.select(0, elt); @@ -756,12 +756,12 @@ void slow_conv_transpose3d_acc_grad_parameters_cuda( // For each elt in batch, do: for (int elt = 0; elt < batch_size; elt++) { - // Matrix mulitply per output: + // Matrix multiply per output: grad_output_n = grad_output.select(0, elt); // Do Weight: if (grad_weight.defined()) { - // Matrix mulitply per output: + // Matrix multiply per output: input_n = input.select(0, elt); if (need_columns) { diff --git a/aten/src/ATen/native/cuda/Normalization.cu b/aten/src/ATen/native/cuda/Normalization.cu index f2104ee9d0459..ce0a50daae145 100644 --- a/aten/src/ATen/native/cuda/Normalization.cu +++ b/aten/src/ATen/native/cuda/Normalization.cu @@ -1,5 +1,7 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include +#include #include #include #include @@ -12,6 +14,8 @@ #include #include #else +#include +#include #include #include #include @@ -19,8 +23,12 @@ #include #include #include +#include +#include #include #include +#include +#include #include #include #include @@ -124,7 +132,7 @@ void batch_norm_elementwise( out, self, *weight, *bias, mean_, invstd_); return; } - C10_FALLTHROUGH; + [[fallthrough]]; } case Impl::General: { const int64_t ndim = self.dim(); @@ -193,7 +201,7 @@ Tensor batch_norm_elementwise_backward_train( return 
batch_norm_backward_elemt_channels_last_cuda_template( grad_out, input, mean, invstd, weight, sum_dy, sum_dy_xmu); } - C10_FALLTHROUGH; + [[fallthrough]]; } case Impl::General: { const auto ndim = input.dim(); @@ -259,9 +267,9 @@ Tensor batch_norm_elementwise_backward_eval( auto weight_nd = weight.as_strided(shape, strides); auto iter = TensorIteratorConfig() .add_output(grad_input) - .add_input(grad_out) - .add_input(invstd_nd) - .add_input(weight_nd) + .add_const_input(grad_out) + .add_const_input(invstd_nd) + .add_const_input(weight_nd) .check_all_same_dtype(false) .promote_inputs_to_common_dtype(false) .build(); @@ -277,8 +285,8 @@ Tensor batch_norm_elementwise_backward_eval( } else { auto iter = TensorIteratorConfig() .add_output(grad_input) - .add_input(grad_out) - .add_input(invstd_nd) + .add_const_input(grad_out) + .add_const_input(invstd_nd) .check_all_same_dtype(false) .promote_inputs_to_common_dtype(false) .build(); @@ -317,7 +325,7 @@ void batch_norm_mean_var(const Tensor& self, Tensor& save_mean, Tensor& save_var }); return; } - C10_FALLTHROUGH; + [[fallthrough]]; } case Impl::General: { const int64_t ndim = self.dim(); @@ -378,7 +386,7 @@ void batch_norm_update_stats_and_invert( .add_output(running_mean) .add_output(running_var) .add_output(save_var) - .add_input(save_mean) + .add_const_input(save_mean) .add_input(save_var) .add_input(running_mean) .add_input(running_var) @@ -473,6 +481,54 @@ std::tuple batch_norm_cuda(const Tensor& self, const c10 return std::make_tuple(output, save_mean, save_invstd); } +std::tuple _batch_norm_with_update_cuda( + const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + Tensor& running_mean, Tensor& running_var, double momentum, double eps) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); + const Tensor& weight = *weight_maybe_owned; + const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();}); + Tensor output, save_mean, save_var, reserve; + + BatchNormBackend backend = _select_batch_norm_backend(input, weight, bias, running_mean, running_var, /*training*/true, eps); + if (backend == BatchNormBackend::Cudnn) { + std::tie(output, save_mean, save_var, reserve) = + at::cudnn_batch_norm(input, weight, bias, running_mean, running_var, /*training*/true, momentum, eps); + } else if (backend == BatchNormBackend::Miopen) { + reserve = at::empty({0}, input.options().dtype(kByte)); + std::tie(output, save_mean, save_var) = + at::miopen_batch_norm(input, weight, bias, running_mean, running_var, /*training*/true, momentum, eps); + } else { + reserve = at::empty({0}, input.options().dtype(kByte)); + std::tie(output, save_mean, save_var) = + batch_norm_cuda(input, weight_opt, bias_opt, running_mean, running_var, /*training*/true, momentum, eps); + } + return std::tuple(output, save_mean, save_var, reserve); +} + +std::tuple _batch_norm_with_update_cuda_out( + const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + Tensor& running_mean, Tensor& running_var, double momentum, double eps, + Tensor& out, Tensor& save_mean, Tensor& save_var, Tensor& reserve) { + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); + const Tensor& weight = *weight_maybe_owned; + const Tensor& bias = c10::value_or_else(bias_opt, [] {return Tensor();}); + + BatchNormBackend backend = _select_batch_norm_backend(input, weight, 
bias, running_mean, running_var, /*training*/true, eps); + if (backend == BatchNormBackend::Cudnn) { + std::tie(out, save_mean, save_var, reserve) = + at::cudnn_batch_norm_out(out, save_mean, save_var, reserve, input, weight, bias, running_mean, running_var, /*training*/true, momentum, eps); + } else if (backend == BatchNormBackend::Miopen) { + std::tie(out, save_mean, save_var) = + at::miopen_batch_norm_out(out, save_mean, save_var, input, weight, bias, running_mean, running_var, /*training*/true, momentum, eps); + } else { + std::tie(out, save_mean, save_var) = + batch_norm_cuda_out(input, weight_opt, bias_opt, running_mean, running_var, /*update*/true, momentum, eps, out, save_mean, save_var); + } + return std::tuple(out, save_mean, save_var, reserve); +} + std::tuple _batch_norm_legit_cuda(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, double momentum, double epsilon) { return batch_norm_cuda(self, weight_opt, bias_opt, running_mean, running_var, train, momentum, epsilon); } @@ -489,6 +545,28 @@ std::tuple _batch_norm_legit_no_stats_cuda_out(const return batch_norm_cuda_out(self, weight_opt, bias_opt, Tensor(), Tensor(), train, momentum, epsilon, output, save_mean, save_invstd); } +std::tuple _new_batch_norm_backward_cuda( + const Tensor& grad_output, const Tensor& input, const Tensor& weight, + const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + const c10::optional& save_mean_opt, const c10::optional& save_var_opt, + bool update, double eps, std::array grad_input_mask, const Tensor& reserve) { + const Tensor& dummy_bias = at::empty(1); + const Tensor& running_mean = c10::value_or_else(running_mean_opt, [] {return Tensor();}); + const Tensor& running_var = c10::value_or_else(running_var_opt, [] {return Tensor();}); + const Tensor& save_mean = c10::value_or_else(save_mean_opt, [] {return Tensor();}); + const Tensor& save_var = c10::value_or_else(save_var_opt, [] {return Tensor();}); + + BatchNormBackend backend = _select_batch_norm_backend(input, weight, dummy_bias, running_mean, running_var, /*training*/true, eps); + + if (backend == BatchNormBackend::Cudnn) { + return at::cudnn_batch_norm_backward(input, grad_output, weight, running_mean, running_var, save_mean, save_var, eps, reserve); + } else if (backend == BatchNormBackend::Miopen) { + return at::miopen_batch_norm_backward(input, grad_output, weight, running_mean, running_var, save_mean, save_var, eps); + } else { + return batch_norm_backward_cuda(grad_output, input, weight, running_mean, running_var, save_mean, save_var, update, eps, grad_input_mask); + } +} + std::tuple batch_norm_backward_cuda(const Tensor& grad_out, const Tensor& input, const c10::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, bool train, double epsilon, std::array grad_input_mask) { // See [Note: hacky wrapper removal for optional tensor] c10::MaybeOwned weight = at::borrow_from_optional_tensor(weight_opt); @@ -499,7 +577,7 @@ std::tuple batch_norm_backward_cuda(const Tensor& grad_o const bool needs_reduction = train || grad_input_mask[1] || grad_input_mask[2]; - // Fused reducion & elementwise kernel + // Fused reduction & elementwise kernel if (needs_reduction && grad_input_mask[0] && !batch_norm_use_channels_last_kernels(input) && cuda::detail::canUse32BitIndexMath(input) && @@ -722,6 +800,8 @@ std::tuple 
batch_norm_update_stats_cuda( c10::MaybeOwned running_var = at::borrow_from_optional_tensor(running_var_opt); const int64_t n_input = self.size(1); + + TORCH_CHECK(self.numel() != 0, "input tensor must have at least one element, but got input_sizes = ", self.sizes()); auto options = self.options().dtype( at::toAccumulateType(self.scalar_type(), /*is_cuda=*/true)); auto save_mean = at::empty({n_input}, options); diff --git a/aten/src/ATen/native/cuda/Normalization.cuh b/aten/src/ATen/native/cuda/Normalization.cuh index ab2b316bc8a4b..2cd05518d726e 100644 --- a/aten/src/ATen/native/cuda/Normalization.cuh +++ b/aten/src/ATen/native/cuda/Normalization.cuh @@ -210,12 +210,12 @@ __device__ __forceinline__ void welford_merge_block_vertical(C& count, template __global__ void batch_norm_transform_input_kernel( - const GenericPackedTensorAccessor input, + const GenericPackedTensorAccessor input, GenericPackedTensorAccessor output, const GenericPackedTensorAccessor::type, 1, RestrictPtrTraits, index_t> mean_, const GenericPackedTensorAccessor::type, 1, RestrictPtrTraits, index_t> var_or_invstd, - const GenericPackedTensorAccessor weight, - const GenericPackedTensorAccessor bias, + const GenericPackedTensorAccessor weight, + const GenericPackedTensorAccessor bias, stat_accscalar_t epsilon) { index_t plane = blockIdx.x; @@ -267,7 +267,7 @@ struct Var { template __global__ void batch_norm_collect_statistics_kernel( - const GenericPackedTensorAccessor input, + const GenericPackedTensorAccessor input, const stat_accscalar_t epsilon, const stat_accscalar_t momentum, GenericPackedTensorAccessor save_mean, @@ -354,16 +354,16 @@ __global__ void batch_norm_collect_statistics_kernel( template __global__ void batch_norm_backward_kernel( - const GenericPackedTensorAccessor input, - const GenericPackedTensorAccessor grad_output, + const GenericPackedTensorAccessor input, + const GenericPackedTensorAccessor grad_output, GenericPackedTensorAccessor grad_input, GenericPackedTensorAccessor grad_weight, GenericPackedTensorAccessor grad_bias, - const GenericPackedTensorAccessor weight, - const GenericPackedTensorAccessor running_mean, - const GenericPackedTensorAccessor running_var, - const GenericPackedTensorAccessor save_mean, - const GenericPackedTensorAccessor save_invstd, + const GenericPackedTensorAccessor weight, + const GenericPackedTensorAccessor running_mean, + const GenericPackedTensorAccessor running_var, + const GenericPackedTensorAccessor save_mean, + const GenericPackedTensorAccessor save_invstd, bool train, stat_accscalar_t epsilon) { @@ -385,7 +385,7 @@ __global__ void batch_norm_backward_kernel( // Compute two values across (batch, x/y/z) in one pass: // 1. Sum(grad_output) // 2. 
DotProduct(input - mean, grad_output) - GradOp> g(mean, input, grad_output); + GradOp> g(mean, input, grad_output); auto res = reduce>(g, grad_output, plane); stat_accscalar_t grad_output_sum = res.v1; @@ -582,7 +582,7 @@ __global__ void batch_norm_backward_elemt_kernel( template class PtrTraits = DefaultPtrTraits, typename index_t = int64_t> static GenericPackedTensorAccessor get_packed_accessor( const Tensor& t, c10::string_view var_name) { - constexpr auto expect_type = c10::CppTypeToScalarType::value; + constexpr auto expect_type = c10::CppTypeToScalarType::type>::value; const auto actual_type = t.scalar_type(); TORCH_CHECK(actual_type == expect_type, "Expected ", var_name, " to have type ", expect_type, " but got ", actual_type); @@ -624,25 +624,25 @@ std::tuple batch_norm_backward_cuda_template(const Tenso } auto input = get_packed_accessor< - input_scalar_t, 3, DefaultPtrTraits, index_t>(input_reshaped, "input"); + const input_scalar_t, 3, DefaultPtrTraits, index_t>(input_reshaped, "input"); auto grad_output = get_packed_accessor< - input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_output_reshaped, "grad_output"); + const input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_output_reshaped, "grad_output"); auto grad_input = packed_accessor_or_dummy< input_scalar_t, 3, DefaultPtrTraits, index_t>(grad_input_reshaped, "grad_input"); auto weight = packed_accessor_or_dummy< - stat_scalar_t, 1, DefaultPtrTraits, index_t>(weight_, "weight"); + const stat_scalar_t, 1, DefaultPtrTraits, index_t>(weight_, "weight"); auto grad_weight = packed_accessor_or_dummy< stat_scalar_t, 1, DefaultPtrTraits, index_t>(grad_weight_, "grad_weight"); auto grad_bias = packed_accessor_or_dummy< stat_scalar_t, 1, DefaultPtrTraits, index_t>(grad_bias_, "grad_bias"); auto running_mean = packed_accessor_or_dummy< - stat_scalar_t, 1, DefaultPtrTraits, index_t>(running_mean_, "running_mean"); + const stat_scalar_t, 1, DefaultPtrTraits, index_t>(running_mean_, "running_mean"); auto running_var = packed_accessor_or_dummy< - stat_scalar_t, 1, DefaultPtrTraits, index_t>(running_var_, "running_var"); + const stat_scalar_t, 1, DefaultPtrTraits, index_t>(running_var_, "running_var"); auto save_mean = packed_accessor_or_dummy< - accscalar_t, 1, DefaultPtrTraits, index_t>(save_mean_, "save_mean"); + const accscalar_t, 1, DefaultPtrTraits, index_t>(save_mean_, "save_mean"); auto save_invstd = packed_accessor_or_dummy< - accscalar_t, 1, DefaultPtrTraits, index_t>(save_invstd_, "save_invstd"); + const accscalar_t, 1, DefaultPtrTraits, index_t>(save_invstd_, "save_invstd"); auto stream = at::cuda::getCurrentCUDAStream(); dim3 blocks(input.size(1)); @@ -670,7 +670,7 @@ void batch_norm_stats_cuda_template( resize_output(out_mean, {n_input}); resize_output(out_invstd, {n_input}); auto input = get_packed_accessor< - scalar_t, 3, RestrictPtrTraits, index_t>(input_reshaped, "input"); + const scalar_t, 3, RestrictPtrTraits, index_t>(input_reshaped, "input"); TORCH_INTERNAL_ASSERT(out_invstd.dim() == 1 && out_invstd.is_contiguous() && out_invstd.sizes()[0]); TORCH_INTERNAL_ASSERT(out_mean.dim() == 1 && out_mean.is_contiguous() && @@ -700,13 +700,13 @@ void batch_norm_elemt_cuda_template(const Tensor& output_, const Tensor& input_, auto output_reshaped = output_.view({input_.size(0), input_.size(1), -1}); auto input = get_packed_accessor< - input_scalar_t, 3, RestrictPtrTraits, index_t>(input_reshaped, "input"); + const input_scalar_t, 3, RestrictPtrTraits, index_t>(input_reshaped, "input"); auto output = get_packed_accessor< 
input_scalar_t, 3, RestrictPtrTraits, index_t>(output_reshaped, "output"); auto weight = packed_accessor_or_dummy< - stat_scalar_t, 1, RestrictPtrTraits, index_t>(weight_, "weight"); + const stat_scalar_t, 1, RestrictPtrTraits, index_t>(weight_, "weight"); auto bias = packed_accessor_or_dummy< - stat_scalar_t, 1, RestrictPtrTraits, index_t>(bias_, "bias"); + const stat_scalar_t, 1, RestrictPtrTraits, index_t>(bias_, "bias"); auto mean = packed_accessor_or_dummy< stat_accscalar_t, 1, RestrictPtrTraits, index_t>(mean_, "mean"); auto invstd = packed_accessor_or_dummy< diff --git a/aten/src/ATen/native/cuda/PersistentSoftmax.cuh b/aten/src/ATen/native/cuda/PersistentSoftmax.cuh index ac5cf934fab04..4553276bab684 100644 --- a/aten/src/ATen/native/cuda/PersistentSoftmax.cuh +++ b/aten/src/ATen/native/cuda/PersistentSoftmax.cuh @@ -316,7 +316,7 @@ void dispatch_softmax_forward(output_t *dst, const input_t *src, int softmax_ele // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_forward. int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; - // use 128 threads per block to maximimize gpu utilization + // use 128 threads per block to maximize gpu utilization constexpr int threads_per_block = 128; int warps_per_block = (threads_per_block / warp_size); @@ -366,7 +366,7 @@ void dispatch_softmax_backward(output_t *grad_input, const input_t *grad, const // This value must match the WARP_BATCH constexpr value computed inside softmax_warp_backward. int batches_per_warp = (next_power_of_two <= 128) ? 2 : 1; - // use 128 threads per block to maximimize gpu utilization + // use 128 threads per block to maximize gpu utilization constexpr int threads_per_block = 128; int warps_per_block = (threads_per_block / warp_size); diff --git a/aten/src/ATen/native/cuda/RNN.cu b/aten/src/ATen/native/cuda/RNN.cu index cf8887d9cc132..a997777fe0c3a 100644 --- a/aten/src/ATen/native/cuda/RNN.cu +++ b/aten/src/ATen/native/cuda/RNN.cu @@ -55,7 +55,7 @@ bool allContiguous(at::TensorList tensors) { } void getLaunchConfig(dim3* block, dim3* grid, int64_t numel) { - int curDevice = -1; + c10::DeviceIndex curDevice = -1; c10::cuda::GetDevice(&curDevice); *block = cuda::getApplyBlock(); TORCH_INTERNAL_ASSERT(cuda::getApplyGrid(numel, *grid, curDevice), diff --git a/aten/src/ATen/native/cuda/ROCmLoops.cuh b/aten/src/ATen/native/cuda/ROCmLoops.cuh deleted file mode 100644 index 75811d7ae6102..0000000000000 --- a/aten/src/ATen/native/cuda/ROCmLoops.cuh +++ /dev/null @@ -1,364 +0,0 @@ -#pragma once - -// This file provides two functions to help write GPU elementwise kernels: -// -// gpu_kernel(TensorIterator iter, ) -// gpu_kernel_with_scalars(TensorIterator iter, ) -// -// The gpu_kernel_with_scalars generates specializations that support a -// single scalar CPU argument, such as from `cuda_tensor + 5`. The CPU scalar -// is lifted to a kernel parameter instead of copying to device memory. -// This should be used in conjunction with TensorIterator::allow_cpu_scalars_, -// which is the default for TensorIterator::binary_op. Otherwise, all inputs -// and the output must be on the GPU. 
-// -// For example, to write a reciprocal kernel for GPU float Tensors: -// -// gpu_kernel(iter, []GPU_LAMBDA(float a) { -// return 1.0f / a; -// }); -// -// To write a multiplication kernel for GPU float Tensors where one argument -// may be a CPU scalar: -// -// gpu_kernel_with_scalars(iter, []GPU_LAMBDA(float a, float b) { -// return a * b; -// }); -// -// See BinaryOpsKernel.cu for the complete implementation -// - -#include - -#include -#include -#include -#include -#include -#include -#include -#include - - -#ifdef __NVCC__ -#define ASSERT_HOST_DEVICE_LAMBDA(type) \ - static_assert(__nv_is_extended_host_device_lambda_closure_type(type), \ - #type " must be a __host__ __device__ lambda") -#else -#define ASSERT_HOST_DEVICE_LAMBDA(type) -#endif - -static constexpr int launch_size_1d = 512; -static constexpr int launch_size_nd = 128; -static constexpr int launch_bound2 = 4; - - -namespace at { namespace native { - -// See [NOTE: Complex Operator Unification] -// std::complex and thrust::complex don't work with some !needs_dynamic_casting optimizations. -// They always currently map to !needs_dynamic_casting even though we sometimes rely on the ability -// to reinterpret_cast between these representations. -// In order to separate these concerns, we have a check for non-c10 complex separately. -template::arity> -struct uses_non_c10_complex { - constexpr static bool check() { - using traits = function_traits; - using type = typename traits::template arg::type; - constexpr bool non_c10_complex = - std::is_same, type>::value - || std::is_same, type>::value - || std::is_same, type>::value - || std::is_same, type>::value; - - if constexpr (non_c10_complex) { - return true; - } else { - return uses_non_c10_complex::check(); - } - } -}; - -template -struct uses_non_c10_complex { - constexpr static bool check() { - using traits = function_traits; - using type = typename traits::result_type; - constexpr bool non_c10_complex = - std::is_same, type>::value - || std::is_same, type>::value - || std::is_same, type>::value - || std::is_same, type>::value; - - return non_c10_complex; - } -}; - -// NOTE: @zasdfgbnm is currently working on rewriting the gpu loops. -// Some of the old codes has been moved to namespace legacy, and -// new codes will be put into namespace modern. These two namespaces -// will coexists for a while until the rewrite is done. Once the rewrite -// is done, we will remove the legacy and modern namespace and everything -// will be in at::native directly. 
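
The uses_non_c10_complex trait defined above is a purely compile-time inspection of the functor's argument and result types. A rough, self-contained sketch of that kind of check follows, with hypothetical helper names, a C++17 fold expression instead of the header's recursion, and only std::complex where the header also handles thrust::complex; it is not the at::native implementation.

#include <complex>
#include <type_traits>

// Is T a raw std::complex (as opposed to c10::complex)?
template <typename T>
struct is_std_complex : std::false_type {};
template <typename T>
struct is_std_complex<std::complex<T>> : std::true_type {};

// Hypothetical, minimal traits over a lambda's call operator: does the functor
// take or return std::complex anywhere?
template <typename T>
struct uses_std_complex : uses_std_complex<decltype(&T::operator())> {};

template <typename C, typename R, typename... Args>
struct uses_std_complex<R (C::*)(Args...) const> {
  static constexpr bool value =
      is_std_complex<std::decay_t<R>>::value ||
      (is_std_complex<std::decay_t<Args>>::value || ...);
};

int main() {
  auto real_op = [](float a, float b) { return a + b; };
  auto cplx_op = [](std::complex<float> a) { return a * a; };
  static_assert(!uses_std_complex<decltype(real_op)>::value, "no complex args");
  static_assert(uses_std_complex<decltype(cplx_op)>::value, "complex arg");
  return 0;
}

The real check feeds into the needs_dynamic_casting decision used further down in this file; the sketch only shows the type inspection.
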
-namespace legacy { - -template -C10_LAUNCH_BOUNDS_2(nt, launch_bound2) -__global__ void elementwise_kernel(int N, func_t f) { - int tid = threadIdx.x; - int nv = nt * vt; - int idx = nv * blockIdx.x + tid; - #pragma unroll - for (int i = 0; i < vt; i++) { - if (idx < N) { - f(idx); - idx += nt; - } - } -} - -template -static void launch_kernel(int64_t N, const func_t& f) { - TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits::max()); - if (N == 0) { - return; - } - dim3 block(nt); - dim3 grid((N + block.x * vt - 1) / (block.x * vt)); - auto stream = at::cuda::getCurrentCUDAStream(); - elementwise_kernel<<>>(N, f); - C10_CUDA_KERNEL_LAUNCH_CHECK(); -} - -template -C10_HOST_DEVICE typename traits::result_type -invoke_impl(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[], int i, - std::index_sequence) { - return f(c10::load::type>(data[INDEX] + i * strides[INDEX])...); -} - -template > -C10_HOST_DEVICE typename traits::result_type -invoke(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[], int i) { - using Indices = std::make_index_sequence; - return invoke_impl(f, data, strides, i, Indices{}); -} - -template -C10_HOST_DEVICE typename traits::result_type -invoke_impl(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[], const ScalarType dtypes[], int i, - std::index_sequence) { - return f(c10::fetch_and_cast::type>(dtypes[I], data[I] + i * strides[I])...); -} - -template > -C10_HOST_DEVICE typename traits::result_type -invoke(const func_t &f, char *const C10_RESTRICT data[], const index_t strides[], const ScalarType dtypes[], int i) { - using Indices = std::make_index_sequence; - return invoke_impl(f, data, strides, dtypes, i, Indices{}); -} - -} // namespace legacy - -// See the note for namespace legacy above. -namespace modern { - -namespace detail { - -template -__device__ inline constexpr decltype(auto) invoke_with_array_impl(func_t f, array_t t, std::index_sequence) -{ - return f(t[I]...); -} -template -__device__ inline constexpr decltype(auto) invoke_with_array(func_t f, array_t a) { - constexpr auto arity = function_traits::arity; - return invoke_with_array_impl(f, a, std::make_index_sequence{}); -} - -namespace arg_type { - -// We need a way to compute the argument type of a function. But -// for nullary function, it does not really have an argument type -// in this case, we still need to return a valid type, but we don't -// really care what type this is. - -struct dont_care {}; - -template -struct arg_type_helper { - using type = typename function_traits::template arg<0>::type; -}; - -template -struct arg_type_helper { - using type = dont_care; -}; - -template -using type = typename arg_type_helper::arity>::type; - -} // namespace arg_type - -template::arity-1> -struct has_same_arg_types { - using traits = function_traits; - static constexpr bool value = std::is_same< - typename traits::template arg::type, - typename traits::template arg::type - >::value && has_same_arg_types::value; -}; - -template -struct has_same_arg_types { - static constexpr bool value = true; -}; - -template -struct has_same_arg_types { - static constexpr bool value = true; -}; - -} // namespace detail - -template -C10_LAUNCH_BOUNDS_1(num_threads()) -__global__ void elementwise_kernel(int N, func_t f, array_t data) { - // Assumption: - // 1. all arguments of `f` have the same type, which could be different from the return type of `f` - // 2. 
all tensors are contiguous, that is: stride == sizeof(type) for all tensors - - using traits = function_traits; - using return_t = typename traits::result_type; - using arg_t = detail::arg_type::type; - constexpr int arity = traits::arity; - - // We need to create array to hold all the arguments, for nullary `f`, this means array of size 0. - // Unfortunately the compiler don't allow us to create array of 0 size, so for this case, we create - // an array of size 1 and just don't use it. - constexpr int nargs = traits::arity == 0 ? 1 : traits::arity; - - int tid = threadIdx.x; - int idx = block_work_size() * blockIdx.x + tid; - - // compute base pointers - return_t *result_base = reinterpret_cast(data[0]) + idx; - arg_t *args_base[nargs]; - #pragma unroll - for (int i = 0; i < arity; i++) { - args_base[i] = reinterpret_cast(data[i + 1]) + idx; - } - - // fetch data - return_t results[thread_work_size()]; - arg_t args[thread_work_size()][nargs]; - #pragma unroll - for (int i = 0; i < thread_work_size(); i++) { - if (idx + num_threads() * i < N) { - #pragma unroll - for (int j = 0; j < arity; j++) { - args[i][j] = c10::load(args_base[j] + i * num_threads()); - } - } - } - - // compute - #pragma unroll - for (int i = 0; i < thread_work_size(); i++) { - if (idx + num_threads() * i < N) { - results[i] = detail::invoke_with_array(f, args[i]); - } - } - - // store data - #pragma unroll - for (int i = 0; i < thread_work_size(); i++) { - if (idx + num_threads() * i < N) { - *(result_base + i * num_threads()) = results[i]; - } - } -} - -// TODO (@zasdfgbnm): this function assume trivial 1d and no dynamic casting -template::value, int> = 0> -static void launch_kernel(int64_t N, const func_t& f, array_t data) { - TORCH_INTERNAL_ASSERT(N >= 0 && N <= std::numeric_limits::max()); - if (N == 0) { - return; - } - int64_t grid = (N + block_work_size() - 1) / block_work_size(); - auto stream = at::cuda::getCurrentCUDAStream(); - elementwise_kernel<<>>(N, f, data); - C10_CUDA_KERNEL_LAUNCH_CHECK(); -} - -template::value, int> = 0> -static void launch_kernel(int64_t N, const func_t& f, array_t data) {} - -} // namespace modern - - -template -void gpu_kernel_impl(TensorIteratorBase& iter, const func_t& f) { - using traits = function_traits; - using arg0_t = typename traits::result_type; - constexpr int ntensors = traits::arity + 1; - - TORCH_INTERNAL_ASSERT(iter.can_use_32bit_indexing()); - TORCH_INTERNAL_ASSERT(iter.ntensors() == traits::arity + 1); - bool non_c10_complex = uses_non_c10_complex::check(); - - at::detail::Array data; - for (int i = 0; i < ntensors; i++) { - data[i] = (char*)iter.data_ptr(i); - } - - at::detail::Array dtypes; - for (int i = 0; i < ntensors; i++) { - dtypes[i] = iter.dtype(i); - } - - int64_t numel = iter.numel(); - if (iter.is_trivial_1d()) { - auto inner_strides = iter.get_inner_strides(); - at::detail::Array strides; - for (int i = 0; i < ntensors; i++) { - strides[i] = inner_strides[i]; - } - - // TODO: can non_c10_complex go through the other path? Need to verify. 
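
In the dynamic-casting branch that follows, every operand is read through a runtime-dtype cast and the result is written back with cast_and_store rather than reinterpreting pointers at a fixed type. A minimal, self-contained sketch of that load-cast-compute-cast-store pattern is shown below; a hypothetical two-value dtype enum stands in for ScalarType, and the _demo helpers are illustrations, not the c10 implementation.

#include <cstdio>

// Hypothetical stand-in for ScalarType, restricted to two dtypes.
enum class Dtype { Float, Double };

// Fetch a value of runtime dtype `t` from `ptr` and cast it to compute_t.
template <typename compute_t>
__host__ __device__ compute_t fetch_and_cast_demo(Dtype t, const void* ptr) {
  switch (t) {
    case Dtype::Float:  return static_cast<compute_t>(*static_cast<const float*>(ptr));
    case Dtype::Double: return static_cast<compute_t>(*static_cast<const double*>(ptr));
  }
  return compute_t(0);
}

// Cast a compute_t value to the runtime dtype `t` and store it at `ptr`.
template <typename compute_t>
__host__ __device__ void cast_and_store_demo(Dtype t, void* ptr, compute_t v) {
  switch (t) {
    case Dtype::Float:  *static_cast<float*>(ptr)  = static_cast<float>(v);  break;
    case Dtype::Double: *static_cast<double*>(ptr) = static_cast<double>(v); break;
  }
}

int main() {
  double in = 2.5;   // input stored as double
  float out = 0.f;   // output stored as float
  // Compute in float, like a float-typed lambda applied to dynamically cast inputs.
  const float result = fetch_and_cast_demo<float>(Dtype::Double, &in) * 2.0f;
  cast_and_store_demo<float>(Dtype::Float, &out, result);
  printf("%f\n", out);  // prints 5.000000
  return 0;
}

Skipping this cast path when needs_dynamic_casting is false is what allows the contiguous path to reinterpret the data pointers directly.
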
- if (needs_dynamic_casting::check(iter) || non_c10_complex) { - legacy::launch_kernel(numel, [=]GPU_LAMBDA(int idx) { - void* out = data[0] + strides[0] * idx; - arg0_t result = legacy::invoke(f, &data.data[1], &strides.data[1], &dtypes.data[1], idx); - c10::cast_and_store(dtypes[0], out, result); - }); - } else if (iter.has_contiguous_first_dim() && modern::detail::has_same_arg_types::value) { - modern::launch_kernel(numel, f, data); - } else { - legacy::launch_kernel(numel, [=]GPU_LAMBDA(int idx) { - arg0_t* out = (arg0_t*)(data[0] + strides[0] * idx); - *out = legacy::invoke(f, &data.data[1], &strides.data[1], idx); - }); - } - } else { - auto offset_calc = ::make_offset_calculator(iter); - // TODO: can non_c10_complex go through the other path? Need to verify. - if (needs_dynamic_casting::check(iter) || non_c10_complex) { - legacy::launch_kernel(numel, [=]GPU_LAMBDA(int idx) { - auto offsets = offset_calc.get(idx); - void* out = data[0] + offsets[0]; - arg0_t result = legacy::invoke(f, &data.data[1], &offsets.data[1], &dtypes.data[1], 1); - c10::cast_and_store(dtypes[0], out, result); - }); - } else { - legacy::launch_kernel(numel, [=]GPU_LAMBDA(int idx) { - auto offsets = offset_calc.get(idx); - arg0_t* out = (arg0_t*)(data[0] + offsets[0]); - *out = legacy::invoke(f, &data.data[1], &offsets.data[1], 1); - }); - } - } -} - -}} // namespace at::native diff --git a/aten/src/ATen/native/cuda/Reduce.cuh b/aten/src/ATen/native/cuda/Reduce.cuh index 0ccb01110e2a6..1f67ee3ea63e1 100644 --- a/aten/src/ATen/native/cuda/Reduce.cuh +++ b/aten/src/ATen/native/cuda/Reduce.cuh @@ -1054,7 +1054,7 @@ ReduceConfig setReduceConfig(const TensorIterator& iter){ // Case 1: "vectorize along input" // This case happens when we are reducing along fastest moving dimesion. In such case, threads // with the same threadIdx.y works on the same reduction cooperatively and will produce results - // for the same ouput. In such case, values in each loaded vector always correspond to the same ouput. + // for the same output. In such case, values in each loaded vector always correspond to the same output. // // Case 2: "vectorize along output" // This case happens when the fastest moving dimesion is not the dimension of reduction. 
In such case, diff --git a/aten/src/ATen/native/cuda/ReflectionPad.cu b/aten/src/ATen/native/cuda/ReflectionPad.cu index 3e576c896742e..6f0ba1fbb7905 100644 --- a/aten/src/ATen/native/cuda/ReflectionPad.cu +++ b/aten/src/ATen/native/cuda/ReflectionPad.cu @@ -160,10 +160,10 @@ __global__ void reflection_pad2d_backward_out_kernel( gpuAtomicAddNoReturn(&grad_input[index_pair.first], grad_output[index_pair.second]); } } -template +template __device__ inline void parallel_reflection_pad3d( - PackedTensorAccessor64 input, - PackedTensorAccessor64 output, + PackedTensorAccessor64 input, + PackedTensorAccessor64 output, int64_t pad_left, int64_t pad_top, int64_t pad_front, @@ -211,7 +211,7 @@ __device__ inline void parallel_reflection_pad3d( template __global__ void reflection_pad3d_out_kernel( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, int64_t pad_left, int64_t pad_top, int64_t pad_front, int64_t y_shift, int64_t z_shift @@ -241,7 +241,7 @@ __global__ void reflection_pad3d_out_kernel( template __global__ void reflection_pad3d_backward_out_kernel( PackedTensorAccessor64 grad_input, - PackedTensorAccessor64 grad_output, + PackedTensorAccessor64 grad_output, int64_t pad_left, int64_t pad_top, int64_t pad_front, int64_t y_shift, int64_t z_shift ) { @@ -595,7 +595,7 @@ TORCH_IMPL_FUNC(reflection_pad3d_out_cuda) ( output_inner = output.unsqueeze(0); } - auto input_packed = input_inner.packed_accessor64(); + auto input_packed = input_inner.packed_accessor64(); auto output_packed = output_inner.packed_accessor64(); int64_t output_plane_size = output_packed.size(2) * output_packed.size(3) * output_packed.size(4); @@ -648,7 +648,7 @@ TORCH_IMPL_FUNC(reflection_pad3d_backward_out_cuda) ( } auto grad_input_packed = grad_input_.packed_accessor64(); - auto grad_output_packed = grad_output_.packed_accessor64(); + auto grad_output_packed = grad_output_.packed_accessor64(); int64_t output_plane_size = grad_output_packed.size(2) * grad_output_packed.size(3) * grad_output_packed.size(4); diff --git a/aten/src/ATen/native/cuda/Repeat.cu b/aten/src/ATen/native/cuda/Repeat.cu index 65c6863745c8c..0a39a0445dbe2 100644 --- a/aten/src/ATen/native/cuda/Repeat.cu +++ b/aten/src/ATen/native/cuda/Repeat.cu @@ -12,8 +12,8 @@ template __global__ static void compute_cuda_kernel( - index_t* repeat_ptr, - int64_t* cumsum_ptr, + const index_t* repeat_ptr, + const int64_t* cumsum_ptr, index_t* result_ptr, int64_t size, int64_t result_size) { @@ -35,8 +35,8 @@ __global__ static void compute_cuda_kernel( template static void compute_cuda( - index_t* repeat_ptr, - int64_t* cumsum_ptr, + const index_t* repeat_ptr, + const int64_t* cumsum_ptr, index_t* result_ptr, int64_t size, int64_t result_size) { diff --git a/aten/src/ATen/native/cuda/ReplicationPadding.cu b/aten/src/ATen/native/cuda/ReplicationPadding.cu index e65c0e90fe03d..d6517516e51ff 100644 --- a/aten/src/ATen/native/cuda/ReplicationPadding.cu +++ b/aten/src/ATen/native/cuda/ReplicationPadding.cu @@ -39,23 +39,23 @@ __host__ __device__ __forceinline__ int imax(int a, int b) { namespace { template __global__ void replication_pad_forward_kernel1d( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, const int padL, const int y_shift, const int z_shift) { - const int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; - const int plane = blockIdx.y + y_shift; - const int batch = blockIdx.z + z_shift; + const int64_t outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + const 
int64_t plane = blockIdx.y + y_shift; + const int64_t batch = blockIdx.z + z_shift; if (outputPointId >= output.size(2)) { return; } - const int outputPointX = outputPointId % output.size(2); + const auto outputPointX = outputPointId % output.size(2); const int iStartX = imax(0, -padL); const int oStartX = imax(0, padL); - const int inputPointX = imin(imax(padL, outputPointX), input.size(2) + padL - 1) - oStartX + iStartX; + const auto inputPointX = imin(imax(padL, outputPointX), input.size(2) + padL - 1) - oStartX + iStartX; scalar_t valueToCopy = input[batch][plane][inputPointX]; output[batch][plane][outputPointX] = valueToCopy; @@ -64,22 +64,22 @@ __global__ void replication_pad_forward_kernel1d( template __global__ void replication_pad_backward_kernel( PackedTensorAccessor64 gradInput, - PackedTensorAccessor64 gradOutput, + PackedTensorAccessor64 gradOutput, const int padL, const int y_shift, const int z_shift) { - const int outputPointId = threadIdx.x + blockIdx.x * blockDim.x; - const int plane = blockIdx.y + y_shift; - const int batch = blockIdx.z + z_shift; + const int64_t outputPointId = threadIdx.x + blockIdx.x * blockDim.x; + const int64_t plane = blockIdx.y + y_shift; + const int64_t batch = blockIdx.z + z_shift; if (outputPointId >= gradOutput.size(2)) { return; } - const int outputPointX = outputPointId % gradOutput.size(2); + const auto outputPointX = outputPointId % gradOutput.size(2); const int iStartX = imax(0, -padL); const int oStartX = imax(0, padL); - const int inputPointX = imin(imax(padL, outputPointX), gradInput.size(2) + padL - 1) - oStartX + iStartX; + const auto inputPointX = imin(imax(padL, outputPointX), gradInput.size(2) + padL - 1) - oStartX + iStartX; scalar_t valueToCopy = gradOutput[batch][plane][outputPointX]; gpuAtomicAddNoReturn(&gradInput[batch][plane][inputPointX], valueToCopy); @@ -87,7 +87,7 @@ __global__ void replication_pad_backward_kernel( template __global__ void replication_pad_forward_kernel2d( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, const int padT, const int padL, @@ -117,7 +117,7 @@ __global__ void replication_pad_forward_kernel2d( template __global__ void replication_pad_backward_kernel( PackedTensorAccessor64 gradInput, - PackedTensorAccessor64 gradOutput, + PackedTensorAccessor64 gradOutput, const int padT, const int padL, const int y_shift, @@ -145,7 +145,7 @@ __global__ void replication_pad_backward_kernel( template __global__ void replication_pad_forward_kernel3d( - PackedTensorAccessor64 input, + PackedTensorAccessor64 input, PackedTensorAccessor64 output, const int pfront, const int ptop, @@ -185,7 +185,7 @@ __global__ void replication_pad_forward_kernel3d( template __global__ void replication_pad_backward_kernel( PackedTensorAccessor64 gradInput, - PackedTensorAccessor64 gradOutput, + PackedTensorAccessor64 gradOutput, const int pfront, const int ptop, const int pleft, @@ -278,7 +278,7 @@ void replication_pad2d_backward_out_cuda_template( gradOutput_ = gradOutput.unsqueeze(0); } auto devGradInput = gradInput_.packed_accessor64(); - auto devGradOutput = gradOutput_.packed_accessor64(); + auto devGradOutput = gradOutput_.packed_accessor64(); int64_t outputPlaneSize = devGradOutput.size(2) * devGradOutput.size(3); int64_t size1 = devGradOutput.size(1); @@ -392,7 +392,7 @@ void replication_pad3d_backward_out_cuda_template( gradOutput_ = gradOutput.unsqueeze(0); } auto devGradInput = gradInput_.packed_accessor64(); - auto devGradOutput = gradOutput_.packed_accessor64(); + auto 
devGradOutput = gradOutput_.packed_accessor64(); const int64_t outputPlaneSize = devGradOutput.size(2) * devGradOutput.size(3) * devGradOutput.size(4); const int64_t size1 = devGradOutput.size(1); @@ -419,8 +419,8 @@ void replication_pad3d_backward_out_cuda_template( TORCH_IMPL_FUNC(replication_pad1d_out_cuda) ( const Tensor& input, IntArrayRef paddingSize, const Tensor& output ) { - TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input), - "input tensor must fit into 32-bit index math"); + TORCH_CHECK(input.numel() < std::numeric_limits::max(), + "replication_pad1d only supports input tensors with less than 2^63 - 1 elements"); int64_t padL = paddingSize[0]; int64_t padR = paddingSize[1]; @@ -446,7 +446,7 @@ TORCH_IMPL_FUNC(replication_pad1d_out_cuda) ( output_ = output.unsqueeze(0); } - auto devInput = input_.packed_accessor64(); + auto devInput = input_.packed_accessor64(); auto devOutput = output_.packed_accessor64(); int64_t outputPlaneSize = devOutput.size(2); @@ -480,19 +480,19 @@ TORCH_IMPL_FUNC(replication_pad1d_backward_out_cuda) ( // Nondeterministic because of atomicAdd usage globalContext().alertNotDeterministic("replication_pad1d_backward_cuda"); - TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(input), - "input tensor must fit into 32-bit index math"); - TORCH_CHECK(at::cuda::detail::canUse32BitIndexMath(gradOutput), - "output gradient tensor must fit into 32-bit index math"); + TORCH_CHECK(input.numel() < std::numeric_limits::max(), + "replication_pad1d only supports input tensors with less than 2^63 - 1 elements"); + TORCH_CHECK(gradOutput.numel() < std::numeric_limits::max(), + "replication_pad1d only supports output tensors with less than 2^63 - 1 elements"); - const int padL = paddingSize[0]; - int dimw = 1; + const int64_t padL = paddingSize[0]; + int64_t dimw = 1; - int numInputDims = input.ndimension(); + int64_t numInputDims = input.ndimension(); if (numInputDims == 3) { dimw++; } - int iwidth = input.size(dimw); + int64_t iwidth = input.size(dimw); if (gradInput.numel() == 0) { return; @@ -509,7 +509,7 @@ TORCH_IMPL_FUNC(replication_pad1d_backward_out_cuda) ( gradOutput_ = gradOutput.unsqueeze(0); } auto devGradInput = gradInput_.packed_accessor64(); - auto devGradOutput = gradOutput_.packed_accessor64(); + auto devGradOutput = gradOutput_.packed_accessor64(); int64_t outputPlaneSize = devGradOutput.size(2); int64_t size1 = devGradOutput.size(1); @@ -551,7 +551,7 @@ TORCH_IMPL_FUNC(replication_pad2d_out_cuda) ( input_ = input.unsqueeze(0); output_ = output.unsqueeze(0); } - auto devInput = input_.packed_accessor64(); + auto devInput = input_.packed_accessor64(); auto devOutput = output_.packed_accessor64(); int64_t outputPlaneSize = devOutput.size(2) * devOutput.size(3); int64_t size1 = devOutput.size(1); @@ -644,7 +644,7 @@ TORCH_IMPL_FUNC(replication_pad3d_out_cuda) ( output_ = output.unsqueeze(0); } - auto devInput = input_.packed_accessor64(); + auto devInput = input_.packed_accessor64(); auto devOutput = output_.packed_accessor64(); const int64_t outputPlaneSize = devOutput.size(2) * devOutput.size(3) * devOutput.size(4); diff --git a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu index 5509d854a34dd..9ef83599cd15c 100644 --- a/aten/src/ATen/native/cuda/ScatterGatherKernel.cu +++ b/aten/src/ATen/native/cuda/ScatterGatherKernel.cu @@ -82,7 +82,7 @@ static TensorAssign tensor_assign; // of the same size. 
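
Looking back at the ReplicationPadding.cu hunks above: dropping the canUse32BitIndexMath requirement in favor of a plain numel bound only works because the per-thread position arithmetic is widened to int64_t at the same time; a flattened output position can exceed what a 32-bit int holds even when each launch dimension individually fits. The following is a minimal sketch of the overflow-safe pattern, illustrative only and not the padding kernels themselves.

#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

// Promote to 64-bit before multiplying so the flattened position cannot wrap
// in 32-bit arithmetic; the last thread reports its own position.
__global__ void last_flat_index(int64_t* out) {
  const int64_t flat = static_cast<int64_t>(blockIdx.x) * blockDim.x + threadIdx.x;
  const int64_t total = static_cast<int64_t>(gridDim.x) * blockDim.x;
  if (flat == total - 1) {
    *out = flat;
  }
}

int main() {
  int64_t* d = nullptr;
  cudaMalloc(&d, sizeof(int64_t));
  // 8,000,000 blocks of 1024 threads is about 8.2e9 positions, past both
  // INT_MAX and UINT_MAX, so a 32-bit flattened index would wrap here.
  last_flat_index<<<8'000'000, 1024>>>(d);
  int64_t h = 0;
  cudaMemcpy(&h, d, sizeof(h), cudaMemcpyDeviceToHost);
  printf("last position = %lld\n", static_cast<long long>(h));  // 8191999999
  cudaFree(d);
  return 0;
}
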
template struct alignas(N) OpaqueType { char data[N]; }; -// essentialy rewritten related to legacy::launch_kernel parts +// essentially rewritten related to legacy::launch_kernel parts template C10_LAUNCH_BOUNDS_2(nt, vt) __global__ void _scatter_gather_elementwise_kernel(int N, func_t f) { @@ -188,8 +188,8 @@ struct cuda_scatter_gather_base_kernel { .check_all_same_dtype(false) .resize_outputs(false) .add_output(self_restrided) - .add_input(src_restrided) - .add_input(index) + .add_const_input(src_restrided) + .add_const_input(index) .build(); auto self_dim_stride = ensure_nonempty_stride(self, dim); @@ -246,8 +246,8 @@ struct cuda_scatter_gather_base_kernel { .check_all_same_dtype(false) .resize_outputs(false) .add_output(self_restrided) - .add_input(src_restrided) - .add_input(index) + .add_const_input(src_restrided) + .add_const_input(index) .build(); auto self_dim_stride = ensure_nonempty_stride(self, dim); @@ -305,8 +305,8 @@ struct cuda_scatter_gather_base_kernel { .check_all_same_dtype(false) .resize_outputs(false) .add_output(self_restrided) - .add_input(src_restrided) - .add_input(index) + .add_const_input(src_restrided) + .add_const_input(index) .build(); auto self_dim_stride = ensure_nonempty_stride(self, dim); @@ -401,7 +401,7 @@ struct cuda_scatter_fill_base_kernel { .check_all_same_dtype(false) .resize_outputs(false) .add_output(self_restrided) - .add_input(index) + .add_const_input(index) .build(); auto index_size = ensure_nonempty_size(self, dim); @@ -444,7 +444,7 @@ struct cuda_scatter_fill_base_kernel { .check_all_same_dtype(false) .resize_outputs(false) .add_output(self_restrided) - .add_input(index) + .add_const_input(index) .build(); auto index_size = ensure_nonempty_size(self, dim); diff --git a/aten/src/ATen/native/cuda/Shape.cu b/aten/src/ATen/native/cuda/Shape.cu index 1680a43e014f4..99fea30540210 100644 --- a/aten/src/ATen/native/cuda/Shape.cu +++ b/aten/src/ATen/native/cuda/Shape.cu @@ -51,6 +51,28 @@ inline bool getCatGrid(ptrdiff_t nTensors, dim3& grid) { return true; } +template +inline std::tuple getCatGridRocm(unsigned int max_elements_per_tensor, + ptrdiff_t nTensors) { + constexpr unsigned int threads_per_block = 256; + constexpr unsigned int elements_per_thread = 8; + constexpr unsigned int max_tb_per_sm = 32; + + unsigned int max_threads = ceil_div(max_elements_per_tensor, elements_per_thread); + unsigned int thread_blocks = ceil_div(max_threads, threads_per_block); + + // Limit the number of thread blocks to prevent too many threads to load the metadata + // if they operate on very small tensors. 
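
(The getCatGridRocm body continues below with the SM-count cap.) For illustration only, the same grid arithmetic applied to assumed numbers: a worst-case tensor of 1<<20 elements on a hypothetical 108-SM device; only the constants 256, 8, and 32 come from the patch itself.

#include <algorithm>
#include <cstdio>

int main() {
  constexpr unsigned int threads_per_block = 256;
  constexpr unsigned int elements_per_thread = 8;
  constexpr unsigned int max_tb_per_sm = 32;
  const unsigned int max_elements_per_tensor = 1u << 20;  // assumed
  const unsigned int num_sm = 108;                        // assumed

  // ceil_div(a, b) == (a + b - 1) / b
  const unsigned int max_threads =
      (max_elements_per_tensor + elements_per_thread - 1) / elements_per_thread;  // 131072
  unsigned int thread_blocks =
      (max_threads + threads_per_block - 1) / threads_per_block;                  // 512
  thread_blocks = std::min(num_sm * max_tb_per_sm, thread_blocks);                // min(3456, 512) = 512

  printf("grid = (%u, nTensors), block = %u\n", thread_blocks, threads_per_block);
  return 0;
}

For much larger inputs the num_sm * max_tb_per_sm product is what bounds the grid.
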
+ + const unsigned int num_sm = at::cuda::getCurrentDeviceProperties()->multiProcessorCount; + thread_blocks = std::min(num_sm * max_tb_per_sm, thread_blocks); + + dim3 block = dim3(threads_per_block); + dim3 grid = dim3(thread_blocks, (long long)nTensors); + + return std::make_tuple(grid, block); +} + template inline std::tuple getCatGridContig(unsigned int max_elements_per_tensor, ptrdiff_t nTensors) { @@ -176,6 +198,34 @@ __global__ void CatArrayBatchedCopy( } } +template +__global__ void CatArrayBatchedCopy_contig( + T* output, + CatArrInputTensorMetadata inputs, + TensorSizeStride os, + const int concatDim, + IndexType dimStride) { + + IndexType tid = blockIdx.x * blockDim.x + threadIdx.x; + IndexType nElements = inputs.nElements[blockIdx.y]; + + if(tid >= nElements) return; + + const T* data = inputs.input[blockIdx.y]; + IndexType offset = inputs.offset[blockIdx.y]; + IndexType dimSize = inputs.dimSize[blockIdx.y]; + IndexType dataOffset = offset * dimStride; + + IndexType stride = gridDim.x * blockDim.x; + + while( tid < nElements){ + IndexType elementOffset = CatArrIndexToOffset::compute( + os.tensorSize, os.tensorStride, dimSize, concatDim, tid); + output[dataOffset + elementOffset] = data[tid]; + tid += stride; + } +} + /* Specialized implementation of the CatArrayBatchedCopy written to generate wide memory loads to improve memory bandwidth throughput. @@ -295,9 +345,14 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i catMetaData.dimSize[batchCounter] = dimSize; catMetaData.nElements[batchCounter] = inputs[i+batchCounter].get().numel(); +#ifdef USE_ROCM + // On ROCm, CatArrayBatchedCopy_contig is faster + isAligned = false; +#else // If at least one of the inputs is not aligned, we can't call the // CatArrayBatchedCopy_aligned16_contig isAligned &= is_aligned_vec4(catMetaData.input[batchCounter]); +#endif if (stride_size > 1) { auto strides = inputs[i+batchCounter].get().strides(); @@ -326,6 +381,15 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i dim3 applyBlock, catGrid; +#ifdef USE_ROCM + // always base grid size on max_elements_per_tensor + { + std::tuple launchParams = getCatGridRocm( + max_elements_per_tensor, batchCounter); + catGrid = std::get<0>(launchParams); + applyBlock = std::get<1>(launchParams); + } +#else if (isContig && sizeof(scalar_t) > 2) { std::tuple launchParams = getCatGridContig( max_elements_per_tensor, batchCounter); @@ -335,6 +399,7 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i applyBlock = dim3(32 * 16); getCatGrid(batchCounter, catGrid); } +#endif if (memory_format != c10::MemoryFormat::Contiguous) { switch (dimension) { @@ -353,6 +418,10 @@ void parallel_cat(const Tensor &out, const MaterializedITensorListRef& inputs, i CatArrayBatchedCopy_aligned16_contig<<<\ catGrid, applyBlock, 0, stream.stream()>>>(\ data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\ + } else if (isContig) {\ + CatArrayBatchedCopy_contig<<<\ + catGrid, applyBlock, 0, stream.stream()>>>(\ + data, catMetaData, outputParam, dimension, outputParam.tensorStride[dimension]);\ } else {\ CatArrayBatchedCopy<<<\ catGrid, applyBlock, 0, stream.stream()>>>(\ diff --git a/aten/src/ATen/native/cuda/SoftMax.cu b/aten/src/ATen/native/cuda/SoftMax.cu index b57d778d7cb58..cffd52624f9e3 100644 --- a/aten/src/ATen/native/cuda/SoftMax.cu +++ b/aten/src/ATen/native/cuda/SoftMax.cu @@ -15,6 +15,7 @@ #include #include #include +#include #ifndef 
AT_PER_OPERATOR_HEADERS #include @@ -172,11 +173,39 @@ inline dim3 SoftMax_getBlockSize(int ILP, uint64_t dim_size) { return dim3(block_size); } +inline dim3 SoftMaxForward_getBlockSize(uint64_t dim_size) { + uint64_t block_size = 1; + uint64_t max_block_size = std::min(dim_size, static_cast(max_threads)); + + // We need a block size that is a multiple of C10_WARP_SIZE in order + // to perform block size reductions using warp shuffle instructions. + // Since max_threads is also a multiple of C10_WARPS_SIZE we do not + // risk creating a block size larger than the limit. + + if (max_block_size % C10_WARP_SIZE == 0) { + block_size = max_block_size; + } else { + block_size = (max_block_size / C10_WARP_SIZE + 1) * C10_WARP_SIZE; + } + + return dim3(block_size); +} + template struct Add { __device__ __forceinline__ T operator()(T a, T b) const { return a + b; } + + __device__ __forceinline__ T combine(T a, T b) const { + return a + b; + } + + // Needed to allow warp level reduction as a first step in the + // thread block reduction + __device__ __forceinline__ T warp_shfl_down(T data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } }; template @@ -184,6 +213,16 @@ struct Max { __device__ __forceinline__ T operator()(T a, T b) const { return a < b ? b : a; } + + __device__ __forceinline__ T combine(T a, T b) const { + return a < b ? b : a; + } + + // Needed to allow warp level reduction as a first step in the + // thread block reduction + __device__ __forceinline__ T warp_shfl_down(T data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } }; // Note that it's not a complete block-wide reduction. @@ -396,6 +435,20 @@ blockReduce(AccumT* smem, AccumT val, return smem[0]; } +// Performs a thread block reduction with a given functor but uses +// warp shuffles as the first step in the reduction +template class Reduction, typename T> +__device__ __forceinline__ +T blockReduceWarp(T* smem_cache, T value, const Reduction& op, T defaultVal) +{ + T result = cuda_utils::BlockReduce>(value, op, defaultVal, smem_cache); + if (threadIdx.x == 0) { + smem_cache[0] = result; + } + __syncthreads(); + return smem_cache[0]; +} + template class Reduction, int ILP, typename T, typename AccumT, typename index_t=int> __device__ __forceinline__ AccumT ilpReduce(index_t shift, @@ -556,7 +609,7 @@ WriteBpropResultsVectorized( } /** - * This will apply the Epilogue with non-vectrorized reads & writes for the general case + * This will apply the Epilogue with non-vectorized reads & writes for the general case */ template class Epilogue> __device__ __forceinline__ void @@ -565,26 +618,7 @@ WriteFpropResults( const scalar_t *input, outscalar_t *output, Epilogue epilogue) { - int offset = threadIdx.x; - - int last = classes % (ILP * blockDim.x); - - // Main bulk of loop with ILP - for (; offset < classes - last; offset += blockDim.x * ILP) { - scalar_t tmp[ILP]; - - #pragma unroll - for (int j = 0; j < ILP; ++j) { - tmp[j] = input[offset + j * blockDim.x]; - } - #pragma unroll - for (int j = 0; j < ILP; ++j) { - output[offset + j * blockDim.x] = epilogue(tmp[j]); - } - } - - // Remainder - no ILP - for (; offset < classes; offset += blockDim.x) { + for (int offset = threadIdx.x; offset < classes; offset += blockDim.x) { output[offset] = epilogue(input[offset]); } } @@ -631,9 +665,6 @@ cunn_SoftMaxForward(outscalar_t *output, const scalar_t *input, int classes) extern __shared__ unsigned char smem[]; auto sdata = reinterpret_cast(smem); - using LoadT = at::native::memory::aligned_vector; - using 
StoreT = at::native::memory::aligned_vector; - // forward pointers to batch[blockIdx.x] // each block handles a sample in the mini-batch input += static_cast(blockIdx.x) * classes; @@ -644,15 +675,15 @@ cunn_SoftMaxForward(outscalar_t *output, const scalar_t *input, int classes) // find the max accscalar_t threadMax = ilpReduce( - shift, input, classes, MaxFloat(), -at::numeric_limits::max()); - accscalar_t max_k = blockReduce( - sdata, threadMax, Max(), -at::numeric_limits::max()); + shift, input, classes, MaxFloat(), -at::numeric_limits::max()); + accscalar_t max_k = blockReduceWarp(sdata, threadMax, + Max(), -at::numeric_limits::max()); // reduce all values accscalar_t threadExp = ilpReduce( - shift, input, classes, SumExpFloat(max_k), static_cast(0)); - accscalar_t sumAll = blockReduce( - sdata, threadExp, Add(), static_cast(0)); + shift, input, classes, SumExpFloat(max_k), static_cast(0)); + accscalar_t sumAll = blockReduceWarp(sdata, threadExp, + Add(), static_cast(0)); Epilogue epilogue(max_k, sumAll); @@ -663,6 +694,78 @@ cunn_SoftMaxForward(outscalar_t *output, const scalar_t *input, int classes) } } +template class Epilogue, typename index_t = int32_t> +__global__ void +cunn_SoftMaxForwardSmem(outscalar_t *output, const scalar_t *input, index_t classes) +{ + // Each thread block processes a sample in the batch + input += static_cast(blockIdx.x) * classes; + output += static_cast(blockIdx.x) * classes; + + accscalar_t threadMax = -at::numeric_limits::max(); + accscalar_t threadExp = static_cast(0); + + // The first smem segment is used to cache input values and the last + // segment is used for thread block reductions + extern __shared__ unsigned char smem[]; + auto smem_input_cache = reinterpret_cast(smem); + auto smem_reduction_cache = reinterpret_cast(smem + + classes * sizeof(scalar_t)); + + using LoadT = at::native::memory::aligned_vector; + const LoadT* const input_vec_ptr = reinterpret_cast(input); + LoadT* const smem_input_cache_vec_ptr = reinterpret_cast(smem_input_cache); + + // Download inputs to shared memory while doing the first step + // in max calculation + MaxFloat maxFunc; + for (index_t offset = threadIdx.x; offset * ILP < classes; offset += blockDim.x) { + LoadT crnt_vec = input_vec_ptr[offset]; + smem_input_cache_vec_ptr[offset] = crnt_vec; + + #pragma unroll + for (int i = 0; i < ILP; ++i) { + threadMax = maxFunc(threadMax, crnt_vec.val[i]); + } + } + + accscalar_t max_k = blockReduceWarp(smem_reduction_cache, threadMax, + Max(), -at::numeric_limits::max()); + + // Reload input from shared memory to compute the sum. The previous + // reduce has performed a __syncthreads() so the smem contents are populated. 
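// For reference only (generic CUDA, not code from this file): the warp-level step
// that the combine()/warp_shfl_down() members added to Add and Max enable is the
// usual shuffle tree,
//
//   for (int offset = C10_WARP_SIZE / 2; offset > 0; offset /= 2)
//     val = op.combine(val, op.warp_shfl_down(val, offset));
//
// after which lane 0 of each warp holds that warp's partial; blockReduceWarp then
// combines the per-warp partials through smem_reduction_cache and synchronizes
// before returning, which is the guarantee the comment above relies on.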
+ SumExpFloat sumExpFunc(max_k); + for (index_t offset = threadIdx.x; offset * ILP < classes; offset += blockDim.x) { + LoadT crnt_vec = smem_input_cache_vec_ptr[offset]; + + #pragma unroll + for (int i = 0; i < ILP; ++i) { + threadExp = sumExpFunc(threadExp, crnt_vec.val[i]); + } + } + + accscalar_t sumAll = blockReduceWarp(smem_reduction_cache, threadExp, + Add(), static_cast(0)); + + Epilogue epilogue(max_k, sumAll); + + // Use vectorized stores to save the output + using StoreT = at::native::memory::aligned_vector; + StoreT* output_vec_ptr = reinterpret_cast(output); + for (index_t offset = threadIdx.x; offset * ILP < classes; offset += blockDim.x) { + LoadT crnt_vec = smem_input_cache_vec_ptr[offset]; + StoreT out_vec; + + #pragma unroll + for (int i = 0; i < ILP; ++i) { + out_vec.val[i] = epilogue(crnt_vec.val[i]); + } + + output_vec_ptr[offset] = out_vec; + } +} + C10_DEVICE bool inline is_32bit_representable(const int64_t value) { return value < static_cast(std::numeric_limits::max()); } @@ -741,9 +844,9 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t AT_DISPATCH_FLOATING_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(), "host_softmax", [&] { using accscalar_t = acc_type; if (!half_to_float) { + auto output_ptr = output.mutable_data_ptr(); + auto input_ptr = input.const_data_ptr(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { - auto output_ptr = output.mutable_data_ptr(); - auto input_ptr = input.const_data_ptr(); int64_t remaining = outer_size; int64_t chunk_size = (1L << 30L) / dim_size; while(remaining > 0) { @@ -755,16 +858,31 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t } } else { constexpr int ILP = sizeof(float4) / sizeof(scalar_t); - dim3 block = SoftMax_getBlockSize(ILP, dim_size); - cunn_SoftMaxForward - <<>>( - output.mutable_data_ptr(), input.const_data_ptr(), dim_size); + dim3 block = SoftMaxForward_getBlockSize(dim_size); + size_t smem_reduction_sz = block.x / C10_WARP_SIZE * sizeof(accscalar_t); + auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - + smem_reduction_sz) / sizeof(scalar_t); + + bool can_use_smem = dim_size < max_elements_per_smem; + can_use_smem &= !(reinterpret_cast(input_ptr) % ALIGN_BYTES); + can_use_smem &= (!(reinterpret_cast(output_ptr) % ALIGN_BYTES)); + can_use_smem &= !(dim_size % ILP); + + if (can_use_smem) { + size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; + cunn_SoftMaxForwardSmem + <<>>(output_ptr, input_ptr, dim_size); + } else { + cunn_SoftMaxForward + <<>>(output_ptr, input_ptr, dim_size); + } + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } else { + auto output_ptr = output.mutable_data_ptr(); + auto input_ptr = input.const_data_ptr(); if (dim_size <= 1024 && dim_size*sizeof(scalar_t) <= 4096) { - auto output_ptr = output.mutable_data_ptr(); - auto input_ptr = input.const_data_ptr(); int64_t remaining = outer_size; int64_t chunk_size = (1<<30) / dim_size; while(remaining > 0) { @@ -775,11 +893,26 @@ Tensor host_softmax(const Tensor & input_, const int64_t dim_, const bool half_t remaining -= chunk_size; } } else { - constexpr int ILP = sizeof(float4) / sizeof(accscalar_t); - dim3 block = SoftMax_getBlockSize(ILP, dim_size); - cunn_SoftMaxForward - <<>>( - output.mutable_data_ptr(), input.const_data_ptr(), dim_size); + constexpr int ILP = sizeof(float4) / sizeof(scalar_t); + dim3 block = SoftMaxForward_getBlockSize(dim_size); + size_t smem_reduction_sz = block.x / 
C10_WARP_SIZE * sizeof(accscalar_t); + auto max_elements_per_smem = (at::cuda::getCurrentDeviceProperties()->sharedMemPerBlock - + smem_reduction_sz) / sizeof(scalar_t); + + bool can_use_smem = dim_size < max_elements_per_smem; + can_use_smem &= !(reinterpret_cast(input_ptr) % ALIGN_BYTES); + can_use_smem &= (!(reinterpret_cast(output_ptr) % ALIGN_BYTES)); + can_use_smem &= !(dim_size % ILP); + + if (can_use_smem) { + size_t smem_sz = dim_size * sizeof(scalar_t) + smem_reduction_sz; + cunn_SoftMaxForwardSmem + <<>>(output_ptr, input_ptr, dim_size); + } else { + cunn_SoftMaxForward + <<>>(output_ptr, input_ptr, dim_size); + } + C10_CUDA_KERNEL_LAUNCH_CHECK(); } } diff --git a/aten/src/ATen/native/cuda/Sorting.cu b/aten/src/ATen/native/cuda/Sorting.cu index 313c6d1ea981b..6272bbb9b75df 100644 --- a/aten/src/ATen/native/cuda/Sorting.cu +++ b/aten/src/ATen/native/cuda/Sorting.cu @@ -22,7 +22,7 @@ namespace { // Finds the rank k element, and its index, of the values along dimension dim template __global__ void gatherKthValue( - cuda::detail::TensorInfo input, + cuda::detail::TensorInfo input, index_t inputSliceSize, index_t k, index_t numInputSlices, @@ -40,13 +40,13 @@ __global__ void gatherKthValue( // Find the start offset for our slice index_t sliceStartIndex = - cuda::detail::IndexToOffset::get(slice, input); + cuda::detail::IndexToOffset::get(slice, input); index_t kthValueSliceStartIndex = cuda::detail::IndexToOffset::get(slice, kthValue); index_t indicesSliceStartIndex = cuda::detail::IndexToOffset::get(slice, indices); - scalar_t* inputSliceStart = &input.data[sliceStartIndex]; + const scalar_t* inputSliceStart = &input.data[sliceStartIndex]; scalar_t* kthValueSliceStart = &kthValue.data[kthValueSliceStartIndex]; int64_t* indicesSliceStart = &indices.data[indicesSliceStartIndex]; @@ -92,7 +92,7 @@ template __global__ void gatherMedian( cuda::detail::TensorInfo values, cuda::detail::TensorInfo indices, - cuda::detail::TensorInfo input, + cuda::detail::TensorInfo input, index_t inputSliceSize, index_t numInputSlices, index_t inputWithinSliceStride, @@ -112,11 +112,11 @@ __global__ void gatherMedian( index_t indicesSliceStartIndex = cuda::detail::IndexToOffset::get(slice, indices); index_t inputSliceStartIndex = - cuda::detail::IndexToOffset::get(slice, input); + cuda::detail::IndexToOffset::get(slice, input); scalar_t* valuesSliceStart = &values.data[valuesSliceStartIndex]; int64_t* indicesSliceStart = &indices.data[indicesSliceStartIndex]; - scalar_t* inputSliceStart = &input.data[inputSliceStartIndex]; + const scalar_t* inputSliceStart = &input.data[inputSliceStartIndex]; index_t nan_count = 0; for (index_t i = threadIdx.x; i < inputSliceSize; i += blockDim.x) { @@ -178,7 +178,7 @@ struct KthValueLauncher { int collapse_values_dim, cuda::detail::TensorInfo indices_info, int collapse_indices_dim, - cuda::detail::TensorInfo self_info, + cuda::detail::TensorInfo self_info, int collapse_self_dim, int64_t num_slices, int64_t slice_size) { @@ -216,7 +216,7 @@ struct MedianLauncher { int collapse_values_dim, cuda::detail::TensorInfo indices_info, int collapse_indices_dim, - cuda::detail::TensorInfo self_info, + cuda::detail::TensorInfo self_info, int collapse_self_dim, int64_t num_slices, int64_t slice_size) { @@ -247,8 +247,8 @@ struct MedianLauncher { void launch_kthvalue_kernel( const TensorBase &values, const TensorBase &indices, const TensorBase &self, int64_t dim, int64_t k) { - AT_DISPATCH_ALL_TYPES_AND( - at::ScalarType::Half, self.scalar_type(), "kthvalue_cuda", [&] { + 
AT_DISPATCH_ALL_TYPES_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "kthvalue_cuda", [&] { AT_DISPATCH_INDEX_TYPES( cuda::detail::canUse32BitIndexMath(self) && cuda::detail::canUse32BitIndexMath(values) && @@ -263,8 +263,8 @@ void launch_kthvalue_kernel( void launch_median_kernel( const TensorBase &vals, const TensorBase &inds, const TensorBase &self, int64_t dim, bool ignore_nan) { - AT_DISPATCH_ALL_TYPES_AND( - at::ScalarType::Half, self.scalar_type(), "median_out_impl", [&] { + AT_DISPATCH_ALL_TYPES_AND2( + at::ScalarType::Half, at::ScalarType::BFloat16, self.scalar_type(), "median_out_impl", [&] { if (cuda::detail::canUse32BitIndexMath(vals) && cuda::detail::canUse32BitIndexMath(inds) && cuda::detail::canUse32BitIndexMath(self)) { diff --git a/aten/src/ATen/native/cuda/SortingCommon.cuh b/aten/src/ATen/native/cuda/SortingCommon.cuh index 4f151c407fea7..c4a8ec6864a1d 100644 --- a/aten/src/ATen/native/cuda/SortingCommon.cuh +++ b/aten/src/ATen/native/cuda/SortingCommon.cuh @@ -116,7 +116,7 @@ void run_launcher( const TensorBase &self, int64_t dim, Launcher l) { - auto self_info = cuda::detail::getTensorInfo(self); + auto self_info = cuda::detail::getTensorInfo(self); auto values_info = cuda::detail::getTensorInfo(values); auto indices_info = cuda::detail::getTensorInfo(indices); diff --git a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh index 238479c545234..1aeaca19652a6 100644 --- a/aten/src/ATen/native/cuda/SortingRadixSelect.cuh +++ b/aten/src/ATen/native/cuda/SortingRadixSelect.cuh @@ -186,7 +186,7 @@ __device__ void countRadixUsingMask( int radixDigitPos, index_t sliceSize, index_t withinSliceStride, - scalar_t* data) { + const scalar_t* data) { // Clear out per-thread counts from a previous round #pragma unroll for (int i = 0; i < RadixSize; ++i) { @@ -256,7 +256,7 @@ constexpr int RADIX_MASK = (RADIX_SIZE - 1); template __device__ scalar_t findPattern( scalar_t* smem, - scalar_t* data, + const scalar_t* data, index_t sliceSize, index_t withinSliceStride, bitwise_t desired, @@ -304,7 +304,7 @@ __device__ scalar_t findPattern( // Returns the top-Kth element found in the data using radix selection template __device__ void radixSelect( - scalar_t* data, + const scalar_t* data, index_t k, bool largest, index_t sliceSize, diff --git a/aten/src/ATen/native/cuda/SpectralOps.cpp b/aten/src/ATen/native/cuda/SpectralOps.cpp index 6a0c05a9e5424..1032fb28d799c 100644 --- a/aten/src/ATen/native/cuda/SpectralOps.cpp +++ b/aten/src/ATen/native/cuda/SpectralOps.cpp @@ -38,52 +38,8 @@ using namespace at::native::detail; static void exec_cufft_plan( const CuFFTConfig &config, void* in_data, void* out_data, bool forward) { auto& plan = config.plan(); -#if defined(USE_ROCM) - auto value_type = config.data_type(); - if (value_type == kFloat) { - switch (config.transform_type()) { - case CuFFTTransformType::C2C: { - CUFFT_CHECK(hipfftExecC2C(plan, static_cast(in_data), - static_cast(out_data), - forward ? HIPFFT_FORWARD : HIPFFT_BACKWARD)); - return; - } - case CuFFTTransformType::R2C: { - CUFFT_CHECK(hipfftExecR2C(plan, static_cast(in_data), - static_cast(out_data))); - return; - } - case CuFFTTransformType::C2R: { - CUFFT_CHECK(hipfftExecC2R(plan, static_cast(in_data), - static_cast(out_data))); - return; - } - } - } else if (value_type == kDouble) { - switch (config.transform_type()) { - case CuFFTTransformType::C2C: { - CUFFT_CHECK(hipfftExecZ2Z(plan, static_cast(in_data), - static_cast(out_data), - forward ? 
HIPFFT_FORWARD : HIPFFT_BACKWARD)); - return; - } - case CuFFTTransformType::R2C: { - CUFFT_CHECK(hipfftExecD2Z(plan, static_cast(in_data), - static_cast(out_data))); - return; - } - case CuFFTTransformType::C2R: { - CUFFT_CHECK(hipfftExecZ2D(plan, static_cast(in_data), - static_cast(out_data))); - return; - } - } - } - TORCH_CHECK(false, "hipFFT doesn't support transforms on type: ", value_type); -#else CUFFT_CHECK(cufftXtExec(plan, in_data, out_data, forward ? CUFFT_FORWARD : CUFFT_INVERSE)); -#endif } @@ -315,7 +271,7 @@ static const Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_ at::globalContext().getNVRTC().cuCtxSetCurrent(pctx); } #endif /* !defined(USE_ROCM) */ - exec_cufft_plan(*config, input.data_ptr(), out.data_ptr(), forward); + exec_cufft_plan(*config, const_cast(input.const_data_ptr()), out.data_ptr(), forward); // Inplace reshaping to original batch shape and inverting the dimension permutation DimVector out_strides(ndim); @@ -387,7 +343,7 @@ Tensor _fft_r2c_cufft(const Tensor& self, IntArrayRef dim, int64_t normalization // CuFFT requires real input to be over-aligned, as if it were complex const auto complex_size = 2 * self.element_size(); const bool complex_aligned = ( - reinterpret_cast(self.data_ptr()) % complex_size == 0); + reinterpret_cast(self.const_data_ptr()) % complex_size == 0); auto working_tensor = self; if (!complex_aligned) { working_tensor = self.movedim(last_dim, -1) diff --git a/aten/src/ATen/native/cuda/SummaryOps.cu b/aten/src/ATen/native/cuda/SummaryOps.cu index a35e5e274b7e2..f2626ccff4db7 100644 --- a/aten/src/ATen/native/cuda/SummaryOps.cu +++ b/aten/src/ATen/native/cuda/SummaryOps.cu @@ -65,7 +65,7 @@ C10_LAUNCH_BOUNDS_1(cuda::getApplyBlockSize()) __global__ void kernelHistogram1D( detail::TensorInfo a, /* output */ detail::TensorInfo p, /* partial output */ - detail::TensorInfo b, /* input */ + detail::TensorInfo b, /* input */ int64_t nbins, at::acc_type minvalue, at::acc_type maxvalue, @@ -86,7 +86,7 @@ __global__ void kernelHistogram1D( FOR_KERNEL_LOOP(linearIndex, totalElements) { // Convert `linearIndex` into an offset of `b` const IndexType bOffset = - detail::IndexToOffset::get(linearIndex, b); + detail::IndexToOffset::get(linearIndex, b); const auto bVal = b.data[bOffset]; if (bVal >= minvalue && bVal <= maxvalue) { // Use value at `b` as an offset of `smem` @@ -112,7 +112,7 @@ __global__ void kernelHistogram1D( FOR_KERNEL_LOOP(linearIndex, totalElements) { // Convert `linearIndex` into an offset of `b` const IndexType bOffset = - detail::IndexToOffset::get(linearIndex, b); + detail::IndexToOffset::get(linearIndex, b); const auto bVal = b.data[bOffset]; if (bVal >= minvalue && bVal <= maxvalue) { // Use value at `b` as an offset of `a` @@ -192,7 +192,7 @@ bool CUDA_tensor_histogram( const dim3 block = getApplyBlock(); dim3 grid; - int64_t curDevice = current_device(); + auto curDevice = current_device(); if (curDevice == -1 || !getApplyGrid(totalElements, grid, curDevice)) { return false; } @@ -219,7 +219,7 @@ bool CUDA_tensor_histogram( using IndexType = int64_t; auto aInfo = detail::getTensorInfo(a); - auto bInfo = detail::getTensorInfo(b); + auto bInfo = detail::getTensorInfo(b); detail::TensorInfo pInfo(nullptr, 0, {}, {}); if (HasWeights) { diff --git a/aten/src/ATen/native/cuda/TensorShape.cu b/aten/src/ATen/native/cuda/TensorShape.cu new file mode 100644 index 0000000000000..d82901ef94529 --- /dev/null +++ b/aten/src/ATen/native/cuda/TensorShape.cu @@ -0,0 +1,841 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS 
+#include +#include +#include +#include +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + +namespace at::native { + +namespace detail { + +// NOTE [CUDA fast path for split_with_sizes_copy.out] +// split_with_sizes_copy.out for contiguous operands has the following +// properties: +// - Each src split consists of multiple chunks that are separated by a fixed +// stride. The number of chunks and the strides are the same across all src +// splits. +// - Each dst split is the concatenation of the chunks in its corresponding src +// splits. +// - The sizes of chunks vary across splits. +// - A (src, dst) chunk pair is not guaranteed to have the +// same alignment. +// +// The following strategies are employed to optimize for this workload: +// - The entire workload is fused into a single kernel to maximize I/O +// throughput and minimize wave quantization. +// - To account for both small and large chunk sizes, a "jagged grid" is used. +// Each chunk is processed by one or more blocks depending on its size. +// - Within each chunk, the region in which writes can be vectorized is +// identified. Within this region, writes are always vectorized and reads are +// oppurtunistically vectorized. +static constexpr int64_t BLOCK_SIZE = 128; +static constexpr int64_t BYTES_PER_THREAD = 16; +static constexpr int64_t BYTES_PER_BLOCK = BYTES_PER_THREAD * BLOCK_SIZE; + +static __host__ __device__ inline int64_t div_up(int64_t a, int64_t b) { + return (a + b - 1) / b; +} + +template +__device__ inline void stream_load128(uint4& val, const T* addr) { + uint64_t low, high; +#if defined(USE_ROCM) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)) + low = reinterpret_cast(addr)[0]; + high = reinterpret_cast(addr)[1]; +#else + asm("ld.global.nc.v2.u64 {%0, %1}, [%2];" + : "=l"(low), "=l"(high) + : "l"(addr)); +#endif + reinterpret_cast(&val)[0] = low; + reinterpret_cast(&val)[1] = high; +} + +template +__device__ inline void stream_store128(T* addr, const uint4& val) { + uint64_t low, high; + low = reinterpret_cast(&val)[0]; + high = reinterpret_cast(&val)[1]; +#if defined(USE_ROCM) || (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 800)) + reinterpret_cast(addr)[0] = low; + reinterpret_cast(addr)[1] = high; +#else + asm("st.global.cs.v2.u64 [%0], {%1, %2};" : : "l"(addr), "l"(low), "l"(high)); +#endif +} + +template +static __device__ inline bool is_aligned(const void* addr) { + return reinterpret_cast(addr) % sizeof(T) == 0; +} + +template +static __device__ inline void load128(uint4& val, const char* addr) { + for (size_t i = 0; i < detail::BYTES_PER_THREAD / sizeof(T); ++i) { + reinterpret_cast(&val)[i] = reinterpret_cast(addr)[i]; + } +} + +template <> +__device__ inline void load128(uint4& val, const char* addr) { + stream_load128(val, addr); +} + +static __device__ inline void load128(uint4& val, const char* addr) { + if (is_aligned(addr)) { + load128(val, addr); + } else if (is_aligned(addr)) { + load128(val, addr); + } else if (is_aligned(addr)) { + load128(val, addr); + } else { + load128(val, addr); + } +} + +static __device__ __inline__ void get_aligned_region( + char* ptr, + const int64_t chunk_size, + const int64_t alignment, + int64_t& align_off, + int64_t& aligned_size) { + const int64_t ptr_val = reinterpret_cast(ptr); + align_off = detail::div_up(ptr_val, alignment) * alignment - ptr_val; + aligned_size = (chunk_size - align_off) / alignment * alignment; +} + +static __device__ __inline__ void copy_chunk( + char* 
dst, + const char* src, + int64_t chunk_size, + int64_t thread_idx, + int64_t num_threads) { + if (chunk_size < num_threads) { + if (thread_idx < chunk_size) { + dst[thread_idx] = src[thread_idx]; + } + return; + } + + // Identify the region in which writes are guaranteed to be 128-bit aligned + int64_t align_off, aligned_size; + get_aligned_region( + dst, chunk_size, detail::BYTES_PER_THREAD, align_off, aligned_size); + + for (int64_t off = align_off + thread_idx * detail::BYTES_PER_THREAD; + off < align_off + aligned_size; + off += num_threads * detail::BYTES_PER_THREAD) { + uint4 val; + // Oppurtunistically vectorize reads + load128(val, &src[off]); + stream_store128(&dst[off], val); + } + + // Handle unaligned regions + if (thread_idx < align_off && thread_idx < chunk_size) { + dst[thread_idx] = src[thread_idx]; + } + if (align_off + aligned_size + thread_idx < chunk_size) { + dst[align_off + aligned_size + thread_idx] = + src[align_off + aligned_size + thread_idx]; + } +} + +static __global__ void split_with_sizes_copy_out_contiguous_no_cast_kernel( + char** dst_base_addrs, + char** src_base_addrs, + int64_t* split_chunk_sizes, + int64_t* block_idx_to_split_idx, + int64_t* blocks_cumsums, + int64_t src_stride, + int64_t num_chunks) { + const int64_t split_idx = block_idx_to_split_idx[blockIdx.x]; + const int64_t split_blocks = + blocks_cumsums[split_idx + 1] - blocks_cumsums[split_idx]; + const int64_t split_threads = split_blocks * blockDim.x; + const int64_t split_thread_idx = + (blockIdx.x - blocks_cumsums[split_idx]) * blockDim.x + threadIdx.x; + const int64_t split_chunk_size = split_chunk_sizes[split_idx]; + + char* dst_base_addr = dst_base_addrs[split_idx]; + char* src_base_addr = src_base_addrs[split_idx]; + + for (int64_t i = blockIdx.y; i < num_chunks; i += gridDim.y) { + copy_chunk( + dst_base_addr + i * split_chunk_size, + src_base_addr + i * src_stride, + split_chunk_size, + split_thread_idx, + split_threads); + } +} + +// Calculate the base addr for each split. +static inline std::vector get_split_base_addrs( + const at::Tensor& tensor, + at::IntArrayRef split_sizes, + int64_t dim) { + const auto* data_ptr = static_cast(tensor.const_data_ptr()); + const auto strides = tensor.strides(); + const auto element_sz = tensor.element_size(); + int64_t off = 0; + std::vector split_base_addrs; + split_base_addrs.reserve(split_sizes.size()); + for (const auto& split_size : split_sizes) { + split_base_addrs.push_back(reinterpret_cast(data_ptr + off)); + off += split_size * strides[dim] * element_sz; + } + return split_base_addrs; +} + +static inline std::vector get_dst_addrs(at::TensorList out) { + std::vector addrs; + addrs.reserve(out.size()); + for (const auto& tensor : out) { + addrs.push_back(reinterpret_cast(tensor.data_ptr())); + } + return addrs; +} + +// Calculate the chunk size for each split in bytes. +static inline std::vector get_split_chunk_sizes( + const at::Tensor& tensor, + at::IntArrayRef split_sizes, + int64_t dim) { + const auto stride = tensor.stride(dim); + const auto element_sz = tensor.element_size(); + std::vector split_chunk_sizes; + split_chunk_sizes.reserve(split_sizes.size()); + for (const auto& split_size : split_sizes) { + split_chunk_sizes.push_back(split_size * stride * element_sz); + } + return split_chunk_sizes; +} + +// Calculate the chunk stride in bytes. This is the same for all splits. 
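// Illustrative example (hypothetical shapes): for a contiguous float32 `self` of
// size [8, 4, 6] split along dim = 1, get_chunk_stride below returns
// 4 * 6 * 4 = 96 bytes, get_num_chunks returns 192 / (4 * 6) = 8, and with
// split_sizes = {1, 3} the split base addresses computed above are data_ptr and
// data_ptr + 1 * 6 * 4 = data_ptr + 24 bytes.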
+static inline int64_t get_chunk_stride(const at::Tensor& tensor, int64_t dim) { + int64_t stride = 1; + for (int64_t d = dim; d < tensor.dim(); ++d) { + stride *= tensor.sizes()[d]; + } + return stride * tensor.element_size(); +} + +// Calculate the number of chunks. This is the same for all splits. +static inline int64_t get_num_chunks(const at::Tensor& tensor, int64_t dim) { + int64_t num_chunks = tensor.numel(); + for (int64_t d = dim; d < tensor.dim(); ++d) { + num_chunks /= tensor.sizes()[d]; + } + return num_chunks; +} + +// Pack multiple std::vector into a single cuda tensor. +std::pair> pack_vecs( + std::vector*> vecs, + const at::Device& device) { + int64_t numel = 0; + for (const auto* vec : vecs) { + numel += vec->size(); + } + + auto packed = at::empty( + {numel}, at::TensorOptions().dtype(at::kLong).pinned_memory(true)); + size_t offset = 0; + for (const auto* vec : vecs) { + memcpy( + packed.data_ptr() + offset, + vec->data(), + sizeof(int64_t) * vec->size()); + offset += vec->size(); + } + packed = packed.to(device, /*non_blocking=*/true); + + std::vector ptrs; + ptrs.reserve(vecs.size()); + offset = 0; + for (const auto* vec : vecs) { + ptrs.push_back(packed.data_ptr() + offset); + offset += vec->size(); + } + return std::make_pair(std::move(packed), std::move(ptrs)); +} + +static inline std::vector get_chunk_cat_out_sizes( + IntArrayRef input_tensor_sizes, + int64_t dim, + int64_t num_chunks, + int64_t chunk_size, + int64_t out_element_size) { + std::vector view_sizes = std::vector( + input_tensor_sizes.begin(), input_tensor_sizes.begin() + dim); + view_sizes.insert( + view_sizes.end(), {num_chunks, chunk_size / out_element_size}); + return view_sizes; +} + +// Copy `max_chunk_size` bytes from `src` to `dst` by `num_threads`, and pad +// zero when `src` size (i.e., actual_chunk_size) is less than `max_chunk_size`. +// Assume elements of src and dst have the same data type. +template +__device__ __inline__ void copy_chunk_with_pad( + dst_t* dst_ptr, + src_t* src_ptr, + int64_t max_chunk_size, + int64_t actual_chunk_size, + int64_t thread_idx, + int64_t num_threads) { + // Supports type cast + if (!std::is_same_v) { + const int64_t max_num_elems = max_chunk_size / sizeof(dst_t); + const int64_t actual_num_elems = actual_chunk_size / sizeof(src_t); + int64_t elem_index = thread_idx; + while (elem_index < actual_num_elems) { + dst_ptr[elem_index] = + static_cast_with_inter_type::apply(src_ptr[elem_index]); + elem_index += num_threads; + } + while (elem_index < max_num_elems) { + dst_ptr[elem_index] = static_cast_with_inter_type::apply(0); + elem_index += num_threads; + } + return; + } + char* dst = reinterpret_cast(dst_ptr); + char* src = reinterpret_cast(src_ptr); + // Fast path when the number of threads is larger than the number of bytes to + // be copied (i.e., max_chunk_size). In this case, each thread only copies 1 + // byte. For 0 <= thread_idx < actual_chunk_size, the thread copies data from + // `src`. For actual_chunk_size <= thread_idx < max_chunk_size, the thread set + // the val=0 for padding. + if (max_chunk_size < num_threads) { + char val = static_cast(0); + if (thread_idx < actual_chunk_size) { + val = src[thread_idx]; + } + if (thread_idx < max_chunk_size) { + dst[thread_idx] = val; + } + return; + } + // Split dst array into three parts: + // [dst, dst+align_off), [dst+align_off, dst+align_end), [dst+align_end, + // dst+max_chunk_size) The second part is aligned with BYTES_PER_THREAD(=16 + // bytes) to enable `stream_store128`. 
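// Worked example with made-up numbers: if dst starts at an address ending in 0x4
// and actual_chunk_size = 100, get_aligned_region yields align_off = 12 and
// aligned_size = (100 - 12) / 16 * 16 = 80, so bytes [0, 12) and
// [92, max_chunk_size) are handled one byte per thread while [12, 92) goes through
// 16-byte stream_store128, reading with stream_load128 only when src + i is itself
// 16-byte aligned.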
+ int64_t align_off, aligned_size; + get_aligned_region( + dst, actual_chunk_size, BYTES_PER_THREAD, align_off, aligned_size); + int64_t align_end = align_off + aligned_size; + for (int64_t i = align_off + thread_idx * BYTES_PER_THREAD; i < align_end; + i += num_threads * BYTES_PER_THREAD) { + uint4 val; + if (is_aligned(src + i)) { + stream_load128(val, src + i); + } else { + for (size_t j = 0; j < BYTES_PER_THREAD; ++j) { + reinterpret_cast(&val)[j] = src[i + j]; + } + } + stream_store128(&dst[i], val); + } + // Copy data for the first part of dst array [dst, dst+align_off). + // Check `thread_idx +static __global__ void chunk_cat_cuda_kernel( + src_t** src, + dst_t* dst, + int64_t* block_idx_to_tensor_idx, + int64_t* tensor_idx_to_start_tensor_bytes, + int64_t* start_block_idx_per_tensor_chunk, + int64_t* actual_tensor_sizes, + int64_t* pad_tensor_chunk_sizes, + int64_t* num_blocks_per_tensor_chunk, + int64_t slice_size, + int64_t chunk_size, + int64_t dst_to_src_ratio) { + const int64_t slice_idx = blockIdx.z; + const int64_t chunk_idx = blockIdx.y; + const int64_t tensor_idx = block_idx_to_tensor_idx[blockIdx.x]; + const int64_t tile_idx = + blockIdx.x - start_block_idx_per_tensor_chunk[tensor_idx]; + // Number of threads for the `tensor_idx`-th tensor chunk. + const int64_t num_threads = + num_blocks_per_tensor_chunk[tensor_idx] * BLOCK_SIZE; + const int64_t thread_idx = tile_idx * BLOCK_SIZE + threadIdx.x; + char* src_addr = reinterpret_cast(src)[tensor_idx] + + slice_idx * actual_tensor_sizes[tensor_idx] + + chunk_idx * pad_tensor_chunk_sizes[tensor_idx] / dst_to_src_ratio; + char* dst_addr = reinterpret_cast(dst) + slice_idx * slice_size + + chunk_idx * chunk_size + tensor_idx_to_start_tensor_bytes[tensor_idx]; + // Compute the actual number of bytes to copy from src. + const int64_t actual_copy_size = std::min( + pad_tensor_chunk_sizes[tensor_idx] / dst_to_src_ratio, + std::max( + (int64_t)0, + actual_tensor_sizes[tensor_idx] - + chunk_idx * pad_tensor_chunk_sizes[tensor_idx] / + dst_to_src_ratio)); + copy_chunk_with_pad( + reinterpret_cast(dst_addr), + reinterpret_cast(src_addr), + pad_tensor_chunk_sizes[tensor_idx], + actual_copy_size, + thread_idx, + num_threads); +} + +bool all_contiguous(TensorList tensors) { + bool contiguous = true; + for (const auto& t : tensors) { + contiguous &= t.is_non_overlapping_and_dense(); + } + return contiguous; +} + +// Get leading dimensions before `dim`-th dimension. +static inline int64_t get_leading_dim(at::IntArrayRef sizes, int64_t dim) { + int64_t leading_dim = 1; + if (dim > 0) { + leading_dim = c10::multiply_integers(sizes.slice(0, dim)); + } + return leading_dim; +} + +// Get trailing dimensions after `dim`-th dimension and padded size along +// `dim`-th dimension. +static inline std::pair get_pad_size( + at::IntArrayRef sizes, + int64_t dim, + int64_t num_chunks) { + int64_t trailing_numel = 1; + if (sizes.size() > (uint64_t)dim + 1) { + trailing_numel = + c10::multiply_integers(sizes.slice(dim + 1, sizes.size() - dim - 1)); + } + int64_t pad_size_along_dim = + detail::div_up(sizes[dim], num_chunks) * num_chunks; + return std::make_pair(pad_size_along_dim, trailing_numel); +} + +// Get the padded chunk size. 
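// For instance (hypothetical shapes): chunking two float32 tensors of sizes [4, 5]
// and [4, 3] into num_chunks = 2 along dim = 1 gives pad_size_along_dim = 6 and 4
// respectively (rounded up to a multiple of num_chunks) with trailing_numel = 1,
// so the per-chunk contributions are 6 * 1 * 4 / 2 = 12 and 4 * 1 * 4 / 2 = 8
// bytes and the padded chunk_size comes out to 20 bytes.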
+static inline int64_t get_chunk_size( + TensorList tensors, + int64_t dim, + int64_t num_chunks, + int64_t elem_size) { + auto num_tensors = tensors.size(); + int64_t chunk_size = 0; + for (const auto i : c10::irange(num_tensors)) { + auto [pad_size_along_dim, trailing_numel] = + get_pad_size(tensors[i].sizes(), dim, num_chunks); + const int64_t pad_tensor_chunk_size = + pad_size_along_dim * trailing_numel * elem_size / num_chunks; + chunk_size += pad_tensor_chunk_size; + } + return chunk_size; +} + +// Get metadata for chunk_cat. +std::tuple< + int64_t, + int64_t, + int64_t, + int64_t, + std::vector, + std::vector, + std::vector, + std::vector, + std::vector, + std::vector, + std::vector> +get_chunk_cat_metadata( + TensorList tensors, + int64_t dim, + int64_t num_chunks, + int64_t dst_elem_size, + int64_t src_elem_size) { + TORCH_CHECK( + dst_elem_size % src_elem_size == 0, + "get_chunk_cat_metadata error: only support dst_elem_size % src_elem_size == 0"); + auto num_tensors = tensors.size(); + int64_t leading_dim = get_leading_dim(tensors[0].sizes(), dim); + std::vector pad_tensor_chunk_sizes; + std::vector num_blocks_per_tensor_chunk; + std::vector start_block_idx_per_tensor_chunk{0}; + std::vector actual_tensor_sizes; + std::vector tensor_idx_to_start_tensor_bytes{0}; + std::vector srcs; + pad_tensor_chunk_sizes.reserve(num_tensors); + num_blocks_per_tensor_chunk.reserve(num_tensors); + start_block_idx_per_tensor_chunk.reserve(num_tensors + 1); + actual_tensor_sizes.reserve(num_tensors); + tensor_idx_to_start_tensor_bytes.reserve(num_tensors + 1); + srcs.reserve(num_tensors); + // block_idx_to_tensor_idx cannot be reserved since the number of blocks is + // data dependent + std::vector block_idx_to_tensor_idx; + // Inline computing `chunk_size` to avoid redundant computation + int64_t chunk_size = 0; + for (const auto i : c10::irange(num_tensors)) { + at::Tensor tensor = tensors[i]; + srcs.push_back(reinterpret_cast(tensor.data_ptr())); + auto sizes = tensor.sizes(); + auto [pad_size_along_dim, trailing_numel] = + get_pad_size(sizes, dim, num_chunks); + const int64_t pad_tensor_chunk_size = + pad_size_along_dim * trailing_numel * dst_elem_size / num_chunks; + pad_tensor_chunk_sizes.push_back(pad_tensor_chunk_size); + chunk_size += pad_tensor_chunk_size; + // Number of blocks required to process this tensor chunk. 
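// Illustrative values: with BYTES_PER_BLOCK = 16 * 128 = 2048, two tensors whose
// padded chunk sizes are 4096 and 1024 bytes get num_blocks = {2, 1},
// start_block_idx_per_tensor_chunk = {0, 2, 3} and block_idx_to_tensor_idx =
// {0, 0, 1}, which is how blockIdx.x alone identifies both the source tensor and
// the tile within its chunk in chunk_cat_cuda_kernel.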
+ const int64_t num_blocks = + detail::div_up(pad_tensor_chunk_size, detail::BYTES_PER_BLOCK); + num_blocks_per_tensor_chunk.push_back(num_blocks); + start_block_idx_per_tensor_chunk.push_back( + start_block_idx_per_tensor_chunk.back() + num_blocks); + block_idx_to_tensor_idx.insert( + block_idx_to_tensor_idx.end(), num_blocks, i); + tensor_idx_to_start_tensor_bytes.push_back( + tensor_idx_to_start_tensor_bytes.back() + pad_tensor_chunk_size); + actual_tensor_sizes.push_back(sizes[dim] * trailing_numel * src_elem_size); + } + const int64_t num_blocks_per_chunk = start_block_idx_per_tensor_chunk.back(); + const int64_t slice_size = num_chunks * chunk_size; + return std::make_tuple( + chunk_size, + leading_dim, + num_blocks_per_chunk, + slice_size, + srcs, + block_idx_to_tensor_idx, + tensor_idx_to_start_tensor_bytes, + start_block_idx_per_tensor_chunk, + actual_tensor_sizes, + pad_tensor_chunk_sizes, + num_blocks_per_tensor_chunk); +} + +// See [CUDA kernel for chunk_cat_cuda] +template +void _chunk_cat_out_cuda_contiguous( + TensorList tensors, + int64_t dim, + int64_t num_chunks, + Tensor& out, + int64_t dst_elem_size, + int64_t src_elem_size) { + const auto device = tensors[0].device(); + // `get_chunk_cat_metadata` must return vectors and `pack_vecs` cannot be + // moved into `get_chunk_cat_metadata`. Otherwise `packed` would point to + // vectors allocated inside `get_chunk_cat_metadata` which become out of local + // scope. + auto + [chunk_size, + leading_dim, + num_blocks_per_chunk, + slice_size, + srcs, + block_idx_to_tensor_idx, + tensor_idx_to_start_tensor_bytes, + start_block_idx_per_tensor_chunk, + actual_tensor_sizes, + pad_tensor_chunk_sizes, + num_blocks_per_tensor_chunk] = + get_chunk_cat_metadata( + tensors, dim, num_chunks, dst_elem_size, src_elem_size); + auto packed = pack_vecs( + {&srcs, + &block_idx_to_tensor_idx, + &tensor_idx_to_start_tensor_bytes, + &start_block_idx_per_tensor_chunk, + &actual_tensor_sizes, + &pad_tensor_chunk_sizes, + &num_blocks_per_tensor_chunk}, + device); + std::vector view_sizes = get_chunk_cat_out_sizes( + tensors[0].sizes(), dim, num_chunks, chunk_size, dst_elem_size); + at::native::resize_output(out, view_sizes); + dim3 blocks(num_blocks_per_chunk, num_chunks, leading_dim); + dim3 threads(detail::BLOCK_SIZE, 1, 1); + detail::chunk_cat_cuda_kernel<<< + blocks, + threads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + /*srcs=*/reinterpret_cast(packed.second[0]), + reinterpret_cast(out.data_ptr()), + /*block_idx_to_tensor_idx=*/packed.second[1], + /*tensor_idx_to_start_tensor_bytes=*/packed.second[2], + /*start_block_idx_per_tensor_chunk=*/packed.second[3], + /*actual_tensor_sizes=*/packed.second[4], + /*pad_tensor_chunk_sizes=*/packed.second[5], + /*num_blocks_per_tensor_chunk=*/packed.second[6], + slice_size, + chunk_size, + dst_elem_size / src_elem_size); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +} // namespace detail + +// See [CUDA fast path for split_with_sizes_copy.out] +void split_with_sizes_copy_out_cuda_contiguous_no_cast( + const at::Tensor& self, + at::IntArrayRef split_sizes, + int64_t dim, + at::TensorList out) { + const auto device = self.device(); + const auto src_base_addrs = + detail::get_split_base_addrs(self, split_sizes, dim); + const auto dst_base_addrs = detail::get_dst_addrs(out); + const auto src_stride = detail::get_chunk_stride(self, dim); + const auto split_chunk_sizes = + detail::get_split_chunk_sizes(self, split_sizes, dim); + const auto num_chunks = detail::get_num_chunks(self, dim); + + // Calculate the number of 
blocks required for the first chunk across all + // splits, assuming each thread only processes BYTES_PER_THREAD bytes. + int64_t num_blocks = 0; + for (const auto& split_chunk_size : split_chunk_sizes) { + num_blocks += detail::div_up( + split_chunk_size, detail::BLOCK_SIZE * detail::BYTES_PER_THREAD); + } + + // Calculate the maximum number of blocks to launch. Only consider + // maxThreadsPerMultiProcessor as a limiting factor as the kernel uses no + // shared memory and little registers. Over-subscribe the SMs to hide I/O + // latency. + const auto num_sms = + at::cuda::getCurrentDeviceProperties()->multiProcessorCount; + const auto max_threads_per_sm = + at::cuda::getCurrentDeviceProperties()->maxThreadsPerMultiProcessor; + const int64_t max_blocks = + num_sms * max_threads_per_sm / detail::BLOCK_SIZE * 2.0; + + // Make each thread process BYTES_PER_THREAD * iter_factor bytes to regulate + // block size. Spread iter_factor evenly between chunks_per_block and + // iters_per_chunk. + int64_t iter_factor = detail::div_up(num_blocks * num_chunks, max_blocks); + int64_t chunks_per_block = std::ceil(std::sqrt(iter_factor)); + chunks_per_block = std::min(chunks_per_block, num_chunks); + const int64_t iters_per_chunk = detail::div_up(iter_factor, chunks_per_block); + + // Launch a logically jagged grid of shape + // (chunk_size*, num_splits, num_chunks / chunks_per_block) + // backed by a physical grid of shape + // (sum(chunk_size), num_chunks / chunks_per_block). + // A block can find its split_idx via block_idx_to_split_idx. + std::vector block_idx_to_split_idx; + std::vector blocks_cumsums{0}; + block_idx_to_split_idx.reserve(num_blocks); + for (size_t split_idx = 0; split_idx < split_sizes.size(); ++split_idx) { + const auto blocks = detail::div_up( + split_chunk_sizes[split_idx], + detail::BLOCK_SIZE * detail::BYTES_PER_THREAD * iters_per_chunk); + block_idx_to_split_idx.insert( + block_idx_to_split_idx.end(), blocks, split_idx); + blocks_cumsums.push_back(blocks_cumsums.back() + blocks); + } + + dim3 blocks(blocks_cumsums.back(), num_chunks / chunks_per_block, 1); + dim3 threads(detail::BLOCK_SIZE, 1, 1); + + auto [_, ptrs] = detail::pack_vecs( + {&dst_base_addrs, + &src_base_addrs, + &split_chunk_sizes, + &block_idx_to_split_idx, + &blocks_cumsums}, + device); + + detail::split_with_sizes_copy_out_contiguous_no_cast_kernel<<< + blocks, + threads, + 0, + at::cuda::getCurrentCUDAStream()>>>( + /*dst_base_addrs=*/reinterpret_cast(ptrs[0]), + /*src_base_addrs=*/reinterpret_cast(ptrs[1]), + /*split_chunk_sizes=*/ptrs[2], + /*block_idx_to_split_idx=*/ptrs[3], + /*blocks_cumsums=*/ptrs[4], + src_stride, + num_chunks); + C10_CUDA_KERNEL_LAUNCH_CHECK(); +} + +void split_with_sizes_copy_out_cuda( + const Tensor& self, + IntArrayRef split_sizes, + int64_t dim, + TensorList out) { + const bool is_capturing = at::cuda::currentStreamCaptureStatusMayInitCtx() != + at::cuda::CaptureStatus::None; + bool contiguous_no_cast = self.is_non_overlapping_and_dense(); + for (const auto& t : out) { + contiguous_no_cast &= t.is_non_overlapping_and_dense(); + contiguous_no_cast &= (t.dtype() == self.dtype()); + } + // TODO(yifu): make the fast path work for CUDA graph + if (!is_capturing && contiguous_no_cast) { + // Perform equivalent checks performed by the composite impl + if (dim < 0) { + dim = at::maybe_wrap_dim(dim, self.dim()); + } + TORCH_CHECK( + self.dim() != 0, "split expects at least a 1-dimensional tensor") + + const int64_t dim_size = self.size(dim); + int64_t split_sizes_sum = 0; + for (const auto 
i : c10::irange(split_sizes.size())) { + TORCH_CHECK( + split_sizes[i] >= 0, + "split_with_sizes expects split_sizes have only non-negative ", + "entries, but got split_sizes=", + split_sizes[i]); + split_sizes_sum += split_sizes[i]; + } + TORCH_CHECK( + split_sizes_sum == dim_size, + "split_with_sizes expects split_sizes to sum exactly to ", + dim_size, + " (input tensor's size at dimension ", + dim, + "), ", + "but got split_sizes=", + split_sizes); + + TORCH_CHECK( + out.size() == split_sizes.size(), + "split_with_sizes_copy_out() expected an out= argument of size ", + split_sizes.size(), + ", got size ", + out.size()); + + auto out_shape = self.sizes().vec(); + for (const auto i : c10::irange(split_sizes.size())) { + out_shape[dim] = split_sizes[i]; + if (resize_output_check(out[i], out_shape)) { + out[i].resize_(out_shape); + } + TORCH_CHECK( + out[i].dtype() == self.dtype(), + "Expected out tensor to have dtype ", + self.dtype(), + ", but got ", + out[i].dtype(), + " instead"); + TORCH_CHECK( + out[i].device() == self.device(), + "Expected out tensor to have device ", + self.device(), + ", but got ", + out[i].device(), + " instead"); + } + split_with_sizes_copy_out_cuda_contiguous_no_cast( + self, split_sizes, dim, out); + } else { + at::native::split_with_sizes_copy_out(self, split_sizes, dim, out); + } +} + +Tensor _chunk_cat_cuda(TensorList tensors, int64_t dim, int64_t num_chunks) { + dim = at::native::preprocess_chunk_cat_inputs(tensors, dim, num_chunks); + if (detail::all_contiguous(tensors)) { + // Return a tensor with the same dtype as input tensors + int64_t elem_size = tensors[0].element_size(); + int64_t chunk_size = + detail::get_chunk_size(tensors, dim, num_chunks, elem_size); + int64_t leading_dim = detail::get_leading_dim(tensors[0].sizes(), dim); + auto view_sizes = detail::get_chunk_cat_out_sizes( + tensors[0].sizes(), dim, num_chunks, chunk_size, elem_size); + Tensor out = + tensors[0] + .new_empty(chunk_size * num_chunks * leading_dim / elem_size) + .view(view_sizes); + // Type-agnostic copy since out and input tensors have the same type. + detail::_chunk_cat_out_cuda_contiguous( + tensors, dim, num_chunks, out, elem_size, elem_size); + return out; + } else { + return at::native::_chunk_cat(tensors, dim, num_chunks); + } +} + +Tensor& _chunk_cat_out_cuda( + TensorList tensors, + int64_t dim, + int64_t num_chunks, + Tensor& out) { + dim = at::native::preprocess_chunk_cat_inputs(tensors, dim, num_chunks); + TORCH_CHECK( + tensors[0].device() == out.device(), + "_chunk_cat_out_cuda: mismatch between input and out tensor devices"); + bool both_input_output_contiguous = + detail::all_contiguous(tensors) && out.is_non_overlapping_and_dense(); + if (both_input_output_contiguous && + (tensors[0].dtype() == at::ScalarType::BFloat16) && + (out.dtype() == at::ScalarType::Float)) { + // _chunk_cat_out_cuda_contiguous should also support other types, thanks to + // static_cast_with_inter_type. Here, we dispatch to BFloat16 in and float32 + // out since it is the only known use case. + detail::_chunk_cat_out_cuda_contiguous( + tensors, + dim, + num_chunks, + out, + out.element_size(), + tensors[0].element_size()); + } else if ( + both_input_output_contiguous && tensors[0].dtype() == out.dtype()) { + // Type-agnostic copy since out and input tensors have the same type. 
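// (Here out.element_size() == tensors[0].element_size(), so the dst_to_src_ratio
// handed to chunk_cat_cuda_kernel is 1; the BFloat16 -> Float branch above runs
// with a ratio of 2 and relies on copy_chunk_with_pad's element-wise
// static_cast_with_inter_type path rather than its raw-byte path.)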
+ detail::_chunk_cat_out_cuda_contiguous( + tensors, + dim, + num_chunks, + out, + out.element_size(), + tensors[0].element_size()); + } else { + at::native::_chunk_cat_out(tensors, dim, num_chunks, out); + } + return out; +} + +} // namespace at::native diff --git a/aten/src/ATen/native/cuda/TensorTopK.cpp b/aten/src/ATen/native/cuda/TensorTopK.cpp index 36e45d4dae2a0..f44cdcdcea2c5 100644 --- a/aten/src/ATen/native/cuda/TensorTopK.cpp +++ b/aten/src/ATen/native/cuda/TensorTopK.cpp @@ -26,8 +26,7 @@ void topk_out_with_sort( const Tensor& values, const Tensor& indices ) { - Tensor sorted_values, sorted_indices; - std::tie(sorted_values, sorted_indices) = at::cuda::sort(self, /* stable= */false, dim, largest); + auto [sorted_values, sorted_indices] = at::cuda::sort(self, /* stable= */false, dim, largest); values.copy_(sorted_values.narrow(dim, 0, k)); indices.copy_(sorted_indices.narrow(dim, 0, k)); } diff --git a/aten/src/ATen/native/cuda/TensorTopK.cu b/aten/src/ATen/native/cuda/TensorTopK.cu index bd48c9b058084..d06efa6635131 100644 --- a/aten/src/ATen/native/cuda/TensorTopK.cu +++ b/aten/src/ATen/native/cuda/TensorTopK.cu @@ -37,7 +37,7 @@ struct AddOp { template C10_LAUNCH_BOUNDS_1(1024) -__global__ void gatherTopK(at::cuda::detail::TensorInfo input, +__global__ void gatherTopK(at::cuda::detail::TensorInfo input, IndexType inputSliceSize, IndexType outputSliceSize, // aka `k` bool largest, @@ -65,13 +65,13 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo input, // Find the start offset for our slice IndexType sliceStartIndex = - at::cuda::detail::IndexToOffset::get(slice, input); + at::cuda::detail::IndexToOffset::get(slice, input); IndexType topKSliceStartIndex = at::cuda::detail::IndexToOffset::get(slice, topK); IndexType indicesSliceStartIndex = at::cuda::detail::IndexToOffset::get(slice, indices); - T* inputSliceStart = &input.data[sliceStartIndex]; + const T* inputSliceStart = &input.data[sliceStartIndex]; T* topKSliceStart = &topK.data[topKSliceStartIndex]; int64_t* indicesSliceStart = &indices.data[indicesSliceStartIndex]; @@ -179,7 +179,7 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo input, template void launch( - at::cuda::detail::TensorInfo input, + at::cuda::detail::TensorInfo input, IndexType inputSliceSize, IndexType outputSliceSize, // aka `k` bool largest, @@ -247,7 +247,7 @@ __global__ void fill(T* x, T value, IndexType size) { template C10_LAUNCH_BOUNDS_1(BLOCK_THREADS) __global__ void radixFindKthValues( - at::cuda::detail::TensorInfo input, + at::cuda::detail::TensorInfo input, uint32_t slice_size, uint32_t* ks_to_find, // size: num_slices @@ -277,8 +277,8 @@ __global__ void radixFindKthValues( Bitwise desired = desires[slice_idx]; uint32_t k_to_find = ks_to_find[slice_idx]; - IndexType slice_start_index = at::cuda::detail::IndexToOffset::get(slice_idx, input); - T* data = &input.data[slice_start_index]; + IndexType slice_start_index = at::cuda::detail::IndexToOffset::get(slice_idx, input); + const T* data = &input.data[slice_start_index]; typedef cub::BlockScan BlockScan; static_assert(MAX_ITEMS_PER_THREAD * BLOCK_THREADS < std::numeric_limits::max(), @@ -300,7 +300,7 @@ __global__ void radixFindKthValues( ? 
items_per_thread : at::ceil_div((int64_t)(slice_size - blk_idx_in_slice * items_per_block), (int64_t)BLOCK_THREADS); - // collect digit counts and store in shared memorey + // collect digit counts and store in shared memory for (int i = 0; i < items_per_thread; ++i) { // Find the start offset for this slice IndexType idx = blk_idx_in_slice * items_per_block + i * BLOCK_THREADS + tidx; @@ -493,7 +493,7 @@ __global__ void computeBlockwiseKthCounts( template C10_LAUNCH_BOUNDS_1(BLOCK_THREADS) -__global__ void gatherTopK(at::cuda::detail::TensorInfo input, +__global__ void gatherTopK(at::cuda::detail::TensorInfo input, IndexType inputSliceSize, IndexType outputSliceSize, // aka `k` bool largest, @@ -537,13 +537,13 @@ __global__ void gatherTopK(at::cuda::detail::TensorInfo input, // Find the start offset for our slice IndexType sliceStartIndex = - at::cuda::detail::IndexToOffset::get(slice_idx, input); + at::cuda::detail::IndexToOffset::get(slice_idx, input); IndexType topKSliceStartIndex = at::cuda::detail::IndexToOffset::get(slice_idx, topK); IndexType indicesSliceStartIndex = at::cuda::detail::IndexToOffset::get(slice_idx, indices); - T* inputSliceStart = &input.data[sliceStartIndex]; + const T* inputSliceStart = &input.data[sliceStartIndex]; T* topKSliceStart = &topK.data[topKSliceStartIndex]; int64_t* indicesSliceStart = &indices.data[indicesSliceStartIndex]; @@ -640,7 +640,7 @@ public: template void launch( - at::cuda::detail::TensorInfo input, + at::cuda::detail::TensorInfo input, IndexType inputSliceSize, IndexType outputSliceSize, // aka `k` bool largest, @@ -836,8 +836,8 @@ void launch_gather_topk_kernel( #define RUN_T(INDEX_T) \ AT_DISPATCH_ALL_TYPES_AND2(at::ScalarType::Half, at::ScalarType::BFloat16, input.scalar_type(), "topk_out_cuda", [&] { \ - at::cuda::detail::TensorInfo inputInfo = \ - at::cuda::detail::getTensorInfo(input); \ + at::cuda::detail::TensorInfo inputInfo = \ + at::cuda::detail::getTensorInfo(input); \ at::cuda::detail::TensorInfo topKInfo = \ at::cuda::detail::getTensorInfo(values); \ at::cuda::detail::TensorInfo indicesInfo = \ diff --git a/aten/src/ATen/native/cuda/TriangularOps.cu b/aten/src/ATen/native/cuda/TriangularOps.cu index e7ab3a44ddf35..efc79672f5621 100644 --- a/aten/src/ATen/native/cuda/TriangularOps.cu +++ b/aten/src/ATen/native/cuda/TriangularOps.cu @@ -19,33 +19,53 @@ #include +#define BOOL_SWITCH(COND, CONST_NAME, ...) 
\ + [&] { \ + if (COND) { \ + constexpr static bool CONST_NAME = true; \ + return __VA_ARGS__(); \ + } else { \ + constexpr static bool CONST_NAME = false; \ + return __VA_ARGS__(); \ + } \ + }() + namespace at::native { // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ triu/tril ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -template -C10_LAUNCH_BOUNDS_1(cuda::getApplyBlockSize()) +constexpr static int block_size = 128; + +template +C10_LAUNCH_BOUNDS_1(block_size) __global__ void triu_tril_kernel( cuda::detail::TensorInfo result_info, - const cuda::detail::TensorInfo self_info, + const cuda::detail::TensorInfo self_info, const int64_t k, - const int64_t N) { - int64_t linear_idx = blockIdx.x * blockDim.x + threadIdx.x; - if (linear_idx >= N) { + const int64_t N_padded, + const IndexType last_dim_padded) { + int64_t linear_idx = (blockIdx.x * blockDim.x + threadIdx.x) * elements_per_thread; + if (linear_idx >= N_padded) { return; } auto dims = self_info.dims; + // Compute column index amd row index + IndexType col = linear_idx % last_dim_padded; + linear_idx /= last_dim_padded; + IndexType row = linear_idx % self_info.sizes[dims - 2]; + + if constexpr (inplace) { + bool mask_all_true = upper ? (col - row >= k) : (col + elements_per_thread - row <= k); + if (mask_all_true) + return; + } + + // Compute offset IndexType self_offset = 0, result_offset = 0; - // Compute column index and corresponding offset - IndexType col = linear_idx % self_info.sizes[dims - 1]; - linear_idx /= self_info.sizes[dims - 1]; self_offset += self_info.strides[dims - 1] * col; result_offset += result_info.strides[dims - 1] * col; - - // Compute row index and corresponding offset - IndexType row = linear_idx % self_info.sizes[dims - 2]; linear_idx /= self_info.sizes[dims - 2]; self_offset += self_info.strides[dims - 2] * row; result_offset += result_info.strides[dims - 2] * row; @@ -60,34 +80,65 @@ __global__ void triu_tril_kernel( result_offset += running_index * result_info.strides[i]; } - bool mask = upper ? (col - row >= k) : (col - row <= k); - result_info.data[result_offset] = mask ? self_info.data[self_offset] : scalar_t(0); + if constexpr (inplace) { + #pragma unroll + for (int i = 0; i < elements_per_thread && col + i < self_info.sizes[dims - 1]; i++) { + bool mask = upper ? (col + i - row >= k) : (col + i - row <= k); + if (!mask) + result_info.data[result_offset + i * result_info.strides[dims - 1]] = scalar_t(0); + } + } else { + scalar_t frag[elements_per_thread] = {}; + bool has_mask = (upper && col + elements_per_thread - row >= k) || (!upper && col - row <= k); + if (has_mask) { + #pragma unroll + for (int i = 0; i < elements_per_thread && col + i < self_info.sizes[dims - 1]; i++) + frag[i] = self_info.data[self_offset + i * self_info.strides[dims - 1]]; + + #pragma unroll + for (int i = 0; i < elements_per_thread; i++) { + bool mask = upper ? (col + i - row >= k) : (col + i - row <= k); + frag[i] = mask ? 
frag[i] : scalar_t(0); + } + } + + #pragma unroll + for (int i = 0; i < elements_per_thread && col + i < self_info.sizes[dims - 1]; i++) + result_info.data[result_offset + i * result_info.strides[dims - 1]] = frag[i]; + } } template void triu_tril_cuda_template(const Tensor& result, const Tensor& self, int64_t k, const char* name) { - int64_t N = self.numel(); - dim3 dim_block = cuda::getApplyBlock(); - dim3 dim_grid((N + dim_block.x - 1) / dim_block.x); AT_DISPATCH_ALL_TYPES_AND_COMPLEX_AND4( at::ScalarType::ComplexHalf, at::ScalarType::Half, at::ScalarType::BFloat16, at::ScalarType::Bool, self.scalar_type(), "triu_tril_cuda_template", [&] { + constexpr int elements_per_thread = sizeof(scalar_t) < 8 ? 8 / sizeof(scalar_t) : 1; + auto sizes = self.sizes(); + int64_t last_dim_padded = round_up(sizes.back(), elements_per_thread); + int64_t N_padded = c10::multiply_integers(sizes.begin(), sizes.end() - 1) * last_dim_padded; + dim3 dim_block = block_size; + dim3 dim_grid((N_padded / elements_per_thread + dim_block.x - 1) / dim_block.x); if (cuda::detail::canUse32BitIndexMath(result) && cuda::detail::canUse32BitIndexMath(self)) { auto result_info = cuda::detail::getTensorInfo(result); - auto self_info = cuda::detail::getTensorInfo(self); - triu_tril_kernel - <<>>( - result_info, self_info, k, N); + auto self_info = cuda::detail::getTensorInfo(self); + BOOL_SWITCH(self.is_same(result), inplace, [&] { + triu_tril_kernel + <<>>( + result_info, self_info, k, N_padded, last_dim_padded); + }); C10_CUDA_KERNEL_LAUNCH_CHECK(); } else { auto result_info = cuda::detail::getTensorInfo(result); - auto self_info = cuda::detail::getTensorInfo(self); - triu_tril_kernel - <<>>( - result_info, self_info, k, N); + auto self_info = cuda::detail::getTensorInfo(self); + BOOL_SWITCH(self.is_same(result), inplace, [&] { + triu_tril_kernel + <<>>( + result_info, self_info, k, N_padded, last_dim_padded); + }); C10_CUDA_KERNEL_LAUNCH_CHECK(); } }); diff --git a/aten/src/ATen/native/cuda/Unique.cu b/aten/src/ATen/native/cuda/Unique.cu index 31aa3a9cd10e6..e2654be0135f8 100644 --- a/aten/src/ATen/native/cuda/Unique.cu +++ b/aten/src/ATen/native/cuda/Unique.cu @@ -1,6 +1,6 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include -#include +#include #include #include @@ -99,7 +99,7 @@ std::tuple unique_dim_cuda_template( * For unique_dim, we are taking the unique with respect to a index * tensor, but during the processes, we override the compare and equal * operator by checking the data underlying it instead. After the - * algorithm, we would use index_select to map the resulting indicies + * algorithm, we would use index_select to map the resulting indices * to the result on the actual data. 
*/ @@ -152,9 +152,7 @@ std::tuple unique_dim_cuda_template( ); } - Tensor inverse_indices, counts; - int64_t num_out; - std::tie(inverse_indices, counts, num_out) = compute_unique( + auto [inverse_indices, counts, num_out] = compute_unique( policy, indices_data, num_inp, indices, return_inverse, return_counts, options, [=] __device__ (int64_t a, int64_t b) -> bool { @@ -188,46 +186,45 @@ std::tuple unique_dim_cuda_template( std::tuple _unique_cuda(const Tensor& self, const bool sorted, const bool return_inverse) { - return AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, self.scalar_type(), "unique", [&] { + return AT_DISPATCH_V2(self.scalar_type(), "unique", AT_WRAP([&] { // The current CUDA implementation of unique always sort due to the // lack of hashtable implementation in thrust - Tensor output, inverse; - std::tie(output, inverse, std::ignore) = internal::unique_cuda_template(self, false, return_inverse, false); + auto [output, inverse, _] = internal::unique_cuda_template(self, false, return_inverse, false); return std::make_tuple(output, inverse); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } std::tuple _unique2_cuda(const Tensor& self, const bool sorted, const bool return_inverse, const bool return_counts) { - return AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, self.scalar_type(), "unique", [&] { + return AT_DISPATCH_V2(self.scalar_type(), "unique", AT_WRAP([&] { // The current CUDA implementation of unique always sort due to the // lack of hashtable implementation in thrust return internal::unique_cuda_template(self, false, return_inverse, return_counts); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } std::tuple unique_dim_cuda(const Tensor& self, const int64_t dim, const bool sorted, const bool return_inverse, const bool return_counts) { - return AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, self.scalar_type(), "unique_dim", [&] { + return AT_DISPATCH_V2(self.scalar_type(), "unique_dim", AT_WRAP([&] { return unique_dim_cuda_template(self, dim, false, return_inverse, return_counts); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } std::tuple unique_dim_consecutive_cuda(const Tensor& self, const int64_t dim, const bool return_inverse, const bool return_counts) { - return AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, self.scalar_type(), "unique_dim", [&] { + return AT_DISPATCH_V2(self.scalar_type(), "unique_dim", AT_WRAP([&] { return unique_dim_cuda_template(self, dim, true, return_inverse, return_counts); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } std::tuple unique_consecutive_cuda(const Tensor& self, const bool return_inverse, const bool return_counts, c10::optional dim) { if (!dim.has_value()) { - return AT_DISPATCH_ALL_TYPES_AND2(kBool, kHalf, self.scalar_type(), "unique", [&] { + return AT_DISPATCH_V2(self.scalar_type(), "unique", AT_WRAP([&] { // The current CUDA implementation of unique always sort due to the // lack of hashtable implementation in thrust return internal::unique_cuda_template(self, true, return_inverse, return_counts); - }); + }), AT_EXPAND(AT_ALL_TYPES), kBool, kHalf, AT_EXPAND(AT_BAREBONES_UNSIGNED_TYPES)); } return unique_dim_consecutive_cuda(self, dim.value(), return_inverse, return_counts); } diff --git a/aten/src/ATen/native/cuda/UniqueCub.cu b/aten/src/ATen/native/cuda/UniqueCub.cu index 38f75f1ee4fee..bbd8673bcf5a6 100644 --- a/aten/src/ATen/native/cuda/UniqueCub.cu +++ 
b/aten/src/ATen/native/cuda/UniqueCub.cu @@ -84,7 +84,7 @@ std::tuple compute_unique( const dim3 block = dim3(std::min(static_cast(cuda::getApplyBlock().x), num_inp)); dim3 grid; - int curDevice = -1; + c10::DeviceIndex curDevice = -1; c10::cuda::GetDevice(&curDevice); cuda::getApplyGrid(num_inp, grid, curDevice); adjacent_difference_kernel<<>>( @@ -158,12 +158,14 @@ struct UniqueCub { } else { sorted = at::empty(self.sizes(), self.options()); } - scalar_t* sorted_data = sorted.mutable_data_ptr(); Tensor sorted_indices; if (!return_inverse) { if (!consecutive) { - cuda::cub::radix_sort_keys(self.const_data_ptr(), sorted_data, num_inp); + cuda::cub::radix_sort_keys( + self.const_data_ptr(), + sorted.mutable_data_ptr(), + num_inp); } } else { if (!consecutive) { @@ -172,7 +174,7 @@ struct UniqueCub { sorted_indices = at::empty({num_inp}, options); cuda::cub::radix_sort_pairs( self.const_data_ptr(), - sorted_data, + sorted.mutable_data_ptr(), range.const_data_ptr(), sorted_indices.mutable_data_ptr(), num_inp); @@ -286,7 +288,7 @@ struct UniqueCub { C10_CUDA_KERNEL_LAUNCH_CHECK(); } - // Final sync to fix the ouput tensors shape + // Final sync to fix the output tensors shape int num_true = 0; at::cuda::memcpy_and_sync(&num_true, tmp_num_true.get(), sizeof(int), cudaMemcpyDeviceToHost, stream); @@ -333,6 +335,9 @@ INSTANTIATE_UNIQUE_CUDA_TEMPLATE(float); INSTANTIATE_UNIQUE_CUDA_TEMPLATE(int32_t); INSTANTIATE_UNIQUE_CUDA_TEMPLATE(int64_t); INSTANTIATE_UNIQUE_CUDA_TEMPLATE(int16_t); +INSTANTIATE_UNIQUE_CUDA_TEMPLATE(uint32_t); +INSTANTIATE_UNIQUE_CUDA_TEMPLATE(uint64_t); +INSTANTIATE_UNIQUE_CUDA_TEMPLATE(uint16_t); INSTANTIATE_UNIQUE_CUDA_TEMPLATE(bool); INSTANTIATE_UNIQUE_CUDA_TEMPLATE(at::Half); diff --git a/aten/src/ATen/native/cuda/UpSample.cuh b/aten/src/ATen/native/cuda/UpSample.cuh index 09e460640df8d..b7f97088c5ff3 100644 --- a/aten/src/ATen/native/cuda/UpSample.cuh +++ b/aten/src/ATen/native/cuda/UpSample.cuh @@ -183,7 +183,7 @@ __device__ __forceinline__ static int nearest_neighbor_exact_bw_compute_source_i /* Used by UpSampleBicubic2d.cu */ template __device__ __forceinline__ static scalar_t upsample_get_value_bounded( - const PackedTensorAccessor64& data, + const PackedTensorAccessor64& data, int batch, int channel, int height, diff --git a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu index c96d7dbae7630..6673fe4993f39 100644 --- a/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBicubic2d.cu @@ -26,7 +26,7 @@ __global__ void upsample_bicubic2d_out_frame( const accscalar_t height_scale, const accscalar_t width_scale, const bool align_corners, - const PackedTensorAccessor64 idata, + const PackedTensorAccessor64 idata, PackedTensorAccessor64 odata) { int index = threadIdx.x + blockIdx.x * blockDim.x; @@ -102,7 +102,7 @@ __global__ void upsample_bicubic2d_backward_out_frame( const accscalar_t width_scale, const bool align_corners, PackedTensorAccessor64 idata, - const PackedTensorAccessor64 odata) { + const PackedTensorAccessor64 odata) { int index = threadIdx.x + blockIdx.x * blockDim.x; const int batchsize = idata.size(0); @@ -195,7 +195,7 @@ static void upsample_bicubic2d_out_cuda_template( input.scalar_type(), "upsample_bicubic2d_out_frame", [&] { using accscalar_t = at::acc_type; - auto idata = input.packed_accessor64(); + auto idata = input.packed_accessor64(); auto odata = output.packed_accessor64(); // Get scaling factors @@ -252,7 +252,7 @@ static void 
upsample_bicubic2d_backward_out_cuda_template( using accscalar_t = at::acc_type; auto idata = grad_input.packed_accessor64(); - auto odata = grad_output.packed_accessor64(); + auto odata = grad_output.packed_accessor64(); const accscalar_t rheight = area_pixel_compute_scale( input_height, output_height, align_corners, scales_h); diff --git a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu index 1570853c844aa..3c80cb7877a5c 100644 --- a/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleBilinear2d.cu @@ -37,7 +37,7 @@ __global__ void upsample_bilinear2d_out_frame( const accscalar_t rheight, const accscalar_t rwidth, const bool align_corners, - const PackedTensorAccessor idata, + const PackedTensorAccessor idata, PackedTensorAccessor odata) { int index = threadIdx.x + blockIdx.x * blockDim.x; @@ -337,7 +337,7 @@ static void upsample_bilinear2d_out_cuda_template( using accscalar_t = at::acc_type; - auto idata = input.packed_accessor64(); + auto idata = input.packed_accessor64(); auto odata = output.packed_accessor64(); const accscalar_t rheight = area_pixel_compute_scale( @@ -474,7 +474,7 @@ C10_LAUNCH_BOUNDS_1(256) // 256 performs better then 1024 __global__ void upsample_gen2d_aa_out_frame( const accscalar_t height_scale, const accscalar_t width_scale, - const PackedTensorAccessor64 idata, + const PackedTensorAccessor64 idata, PackedTensorAccessor64 odata, const InterpFilter & interp_filter) { @@ -568,7 +568,7 @@ __global__ void upsample_gen2d_aa_backward_out_frame( const accscalar_t height_scale, const accscalar_t width_scale, PackedTensorAccessor64 idata, - const PackedTensorAccessor64 odata, + const PackedTensorAccessor64 odata, const InterpFilter & interp_filter) { const int batchsize = idata.size(0); @@ -704,7 +704,7 @@ static void upsample_gen2d_aa_out_cuda_template( input.scalar_type(), "upsample_bilinear2d_out_frame", [&] { using accscalar_t = at::acc_type; - auto idata = input.packed_accessor64(); + auto idata = input.packed_accessor64(); auto odata = output_c.packed_accessor64(); const accscalar_t height_scale = area_pixel_compute_scale( @@ -807,7 +807,7 @@ static void upsample_gen2d_aa_backward_out_cuda_template( using accscalar_t = at::acc_type; auto idata = grad_input.packed_accessor64(); - auto odata = grad_output.packed_accessor64(); + auto odata = grad_output.packed_accessor64(); const accscalar_t height_scale = area_pixel_compute_scale( input_height, output_height, align_corners, scales_h); diff --git a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu index 54a03ae61b8f8..dfba2f5479071 100644 --- a/aten/src/ATen/native/cuda/UpSampleLinear1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleLinear1d.cu @@ -28,7 +28,7 @@ __global__ void upsample_linear1d_out_frame( const int n, const accscalar_t rwidth, const bool align_corners, - const PackedTensorAccessor64 idata, + const PackedTensorAccessor64 idata, PackedTensorAccessor64 odata) { int index = threadIdx.x + blockIdx.x * blockDim.x; @@ -76,7 +76,7 @@ __global__ void upsample_linear1d_out_frame_backward( const accscalar_t rwidth, const bool align_corners, PackedTensorAccessor64 idata, - const PackedTensorAccessor64 odata) { + const PackedTensorAccessor64 odata) { int index = threadIdx.x + blockIdx.x * blockDim.x; const int batchsize = idata.size(0); @@ -143,7 +143,7 @@ static void upsample_linear1d_out_cuda_template( input.scalar_type(), "upsample_linear1d_out_frame", [&] { using accscalar_t = 
at::acc_type; - auto idata = input.packed_accessor64(); + auto idata = input.packed_accessor64(); auto odata = output.packed_accessor64(); const accscalar_t rwidth = area_pixel_compute_scale( @@ -188,7 +188,7 @@ static void upsample_linear1d_backward_out_cuda_template( using accscalar_t = at::acc_type; auto idata = grad_input.packed_accessor64(); - auto odata = grad_output.packed_accessor64(); + auto odata = grad_output.packed_accessor64(); const accscalar_t rwidth = area_pixel_compute_scale( input_width, output_width, align_corners, scales); diff --git a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu index 6bfeef431c13f..3085cba0a1d16 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest1d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest1d.cu @@ -175,6 +175,7 @@ static void upsample_nearest1d_backward_out_cuda_template( dim3 gdim{ceil_div(n, bdim.x)}; // safe check for int32 indexing; implicitly restrict launch config for kernel TORCH_CHECK(grad_input.numel() <= std::numeric_limits::max()); + TORCH_CHECK(grad_output.numel() <= std::numeric_limits::max()); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Byte, grad_output.scalar_type(), "upsample_nearest1d_backward_out_frame", [&] { diff --git a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu index ba71fdc0b077f..197fc9d60bef7 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest2d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest2d.cu @@ -122,7 +122,7 @@ __global__ void upsample_nearest2d_backward_out_frame( scalar_t* grad_i, float height_scale, float width_scale) { - int dst_idx = blockIdx.x * blockDim.x + threadIdx.x; + int64_t dst_idx = blockIdx.x * blockDim.x + threadIdx.x; if (dst_idx >= dim_c * dst_dim_h * dst_dim_w) return; @@ -151,7 +151,7 @@ __global__ void upsample_nearest2d_backward_out_frame( accscalar_t grad = 0; for (int y = src_y; y < src_y_up; y++) { for (int x = src_x; x < src_x_up; x++) { - int src_idx = + int64_t src_idx = b * dim_c * src_c_stride + c * src_c_stride + y * src_dim_w + x; grad += grad_o[src_idx]; } @@ -408,8 +408,9 @@ static void upsample_nearest2d_backward_out_cuda_template( dim3 bdim{std::min( at::cuda::getCurrentDeviceProperties()->maxThreadsPerBlock, MAX_THREADS)}; dim3 gdim{ceil_div(n, bdim.x)}; - // safe check for int32 indexing; implicitly restrict launch config for kernel - TORCH_CHECK(grad_input.numel() <= std::numeric_limits::max()); + // safe check for int64 indexing; implicitly restrict launch config for kernel + TORCH_CHECK(grad_input.numel() <= std::numeric_limits::max(), "upsample2d grad_input.numel() <= std::numeric_limits::max()"); + TORCH_CHECK(grad_output.numel() <= std::numeric_limits::max(), "upsample2d grad_output.numel() <= std::numeric_limits::max()"); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Byte, grad_output.scalar_type(), "upsample_nearest2d_backward_out_frame", [&] { diff --git a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu index f9c1dfdb8ab76..31a7ee92e7488 100644 --- a/aten/src/ATen/native/cuda/UpSampleNearest3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleNearest3d.cu @@ -255,6 +255,7 @@ static void upsample_nearest3d_backward_out_cuda_template( dim3 gdim{ceil_div(n, bdim.x)}; // safe check for int32 indexing; implicitly restrict launch 
config for kernel TORCH_CHECK(grad_input.numel() <= std::numeric_limits::max()); + TORCH_CHECK(grad_output.numel() <= std::numeric_limits::max()); cudaStream_t stream = at::cuda::getCurrentCUDAStream(); AT_DISPATCH_FLOATING_TYPES_AND3(ScalarType::Half, ScalarType::BFloat16, ScalarType::Byte, grad_output.scalar_type(), "upsample_nearest3d_backward_out_frame", [&] { diff --git a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu index de8e797c6d358..43cc09d34b677 100644 --- a/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu +++ b/aten/src/ATen/native/cuda/UpSampleTrilinear3d.cu @@ -43,7 +43,7 @@ __global__ void upsample_trilinear3d_out_frame( const accscalar_t rheight, const accscalar_t rwidth, const bool align_corners, - const PackedTensorAccessor64 idata, + const PackedTensorAccessor64 idata, PackedTensorAccessor64 odata) { int index = threadIdx.x + blockIdx.x * blockDim.x; @@ -128,7 +128,7 @@ __global__ void upsample_trilinear3d_backward_out_frame( const accscalar_t rwidth, const bool align_corners, PackedTensorAccessor64 idata, - const PackedTensorAccessor64 odata, + const PackedTensorAccessor64 odata, scalar_t* idata_ptr) { int index = threadIdx.x + blockIdx.x * blockDim.x; @@ -269,7 +269,7 @@ static void upsample_trilinear3d_out_cuda_template( input.scalar_type(), "upsample_trilinear3d_out_frame", [&] { using accscalar_t = at::acc_type; - auto idata = input.packed_accessor64(); + auto idata = input.packed_accessor64(); auto odata = output.packed_accessor64(); const accscalar_t rdepth = area_pixel_compute_scale( @@ -296,7 +296,7 @@ static void upsample_trilinear3d_out_cuda_template( } static void upsample_trilinear3d_backward_out_cuda_template( - const Tensor& grad_input, + const Tensor& grad_input_, const Tensor& grad_output_, IntArrayRef output_size, IntArrayRef input_size, @@ -304,7 +304,7 @@ static void upsample_trilinear3d_backward_out_cuda_template( c10::optional scales_d, c10::optional scales_h, c10::optional scales_w) { - TensorArg grad_input_arg{grad_input, "grad_input", 1}, + TensorArg grad_input_arg{grad_input_, "grad_input_", 1}, grad_output_arg{grad_output_, "grad_output_", 2}; checkAllSameGPU( "upsample_trilinear3d_backward_out_cuda", @@ -321,7 +321,8 @@ static void upsample_trilinear3d_backward_out_cuda_template( Tensor grad_output = grad_output_.contiguous(); // A contiguous tensor is required for the kernel launch config - grad_input.contiguous(); + Tensor grad_input = grad_input_.contiguous(); + // Numbers are added atomically to grad_input tensor from multiple threads, // so it has to be initialized to zero. 
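(Editorial aside, not part of the patch: the trilinear3d hunk above also fixes a latent bug — the old code called `grad_input.contiguous()` and discarded the result, so a non-contiguous `grad_input` could reach a kernel that assumes contiguous storage and accumulates into it atomically. The sketch below uses standard ATen calls plus a hypothetical `accumulate_into` stand-in for the backward kernel, and shows the pattern the fix follows: compute into a contiguous, zero-initialized buffer and copy back only if the caller's tensor was not contiguous.)

```cpp
// Minimal sketch of the contiguous-scratch pattern; accumulate_into is a
// hypothetical placeholder for the real atomic-add backward kernel.
#include <ATen/ATen.h>

void accumulate_into(at::Tensor& buf, const at::Tensor& src) {
  buf.add_(src.sum());  // stand-in for the kernel's atomic accumulation
}

void backward_like(const at::Tensor& grad_output, at::Tensor& grad_input_) {
  // contiguous() returns the tensor itself when it is already contiguous,
  // otherwise it materializes a contiguous copy.
  at::Tensor grad_input = grad_input_.contiguous();
  // Values are accumulated from many threads, so start from zero.
  grad_input.zero_();
  accumulate_into(grad_input, grad_output);
  // If we worked on a copy, write the result back to the caller's tensor.
  if (!grad_input_.is_contiguous()) {
    grad_input_.copy_(grad_input);
  }
}
```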
grad_input.zero_(); @@ -339,7 +340,7 @@ static void upsample_trilinear3d_backward_out_cuda_template( using accscalar_t = at::acc_type; auto idata = grad_input.packed_accessor64(); - auto odata = grad_output.packed_accessor64(); + auto odata = grad_output.packed_accessor64(); scalar_t* idata_ptr = grad_input.mutable_data_ptr(); const accscalar_t rdepth = area_pixel_compute_scale( @@ -363,6 +364,10 @@ static void upsample_trilinear3d_backward_out_cuda_template( odata, idata_ptr); C10_CUDA_KERNEL_LAUNCH_CHECK(); + + if (!grad_input_.is_contiguous()) { + grad_input_.copy_(grad_input); + } }); } diff --git a/aten/src/ATen/native/cuda/block_reduce.cuh b/aten/src/ATen/native/cuda/block_reduce.cuh index fa75c71f8acaf..e8fd69c0aec93 100644 --- a/aten/src/ATen/native/cuda/block_reduce.cuh +++ b/aten/src/ATen/native/cuda/block_reduce.cuh @@ -16,7 +16,7 @@ constexpr int kCUDABlockReduceNumThreads = 512; // NOTE: This is >= the max block size on current hardware anyway (1024). constexpr int kCUDABlockReduceMaxThreads = C10_WARP_SIZE * C10_WARP_SIZE; -// Sums `val` accross all threads in a warp. +// Sums `val` across all threads in a warp. // // Assumptions: // - The size of each block should be a multiple of `C10_WARP_SIZE` @@ -29,6 +29,19 @@ __inline__ __device__ T WarpReduceSum(T val) { return val; } +// Picks the maximum `val` across all threads in a warp. +// +// Assumptions: +// - The size of each block should be a multiple of `C10_WARP_SIZE` +template +__inline__ __device__ T WarpReduceMax(T val) { +#pragma unroll + for (int offset = (C10_WARP_SIZE >> 1); offset > 0; offset >>= 1) { + val = max_propagate_nan(val, WARP_SHFL_DOWN(val, offset)); + } + return val; +} + struct Block1D { static __forceinline__ __device__ int Tid() { return threadIdx.x; } @@ -72,6 +85,31 @@ __inline__ __device__ T BlockReduceSum(T val, T* shared) { return val; } +// Picks out the maximum `val` across all threads in a block. +// +// Warning: the return value is only valid for thread 0. +// Assumptions: +// - The size of each block should be a multiple of `C10_WARP_SIZE` +// - `shared` should be a pointer to shared memory with size of, at least, +// `sizeof(T) * number_of_warps` +template +__inline__ __device__ T BlockReduceMax(T val, T* shared) { + const int tid = B::Tid(); + const int lid = tid % C10_WARP_SIZE; + const int wid = tid / C10_WARP_SIZE; + val = WarpReduceMax(val); + __syncthreads(); // prevent races when BlockReduces are called in a row. + if (lid == 0) { + shared[wid] = val; + } + __syncthreads(); + val = (tid < B::Warps()) ? 
shared[lid] : T(0); + if (wid == 0) { + val = WarpReduceMax(val); + } + return val; +} + template __inline__ __device__ T WarpReduce(T val, const ReduceOp& op) { #pragma unroll diff --git a/aten/src/ATen/native/cuda/cutlass_extensions/gemm/threadblock/dq_mma_base.h b/aten/src/ATen/native/cuda/cutlass_extensions/gemm/threadblock/dq_mma_base.h index a800dbaeaa2d6..0d1b0bd8e7a5e 100644 --- a/aten/src/ATen/native/cuda/cutlass_extensions/gemm/threadblock/dq_mma_base.h +++ b/aten/src/ATen/native/cuda/cutlass_extensions/gemm/threadblock/dq_mma_base.h @@ -113,7 +113,7 @@ class DqMmaBase { /// Shape describing the number of warps filling the CTA using WarpCount = GemmShape; - /// Number of warp-level GEMM oeprations + /// Number of warp-level GEMM operations static int const kWarpGemmIterations = (WarpGemm::kK / Operator::Policy::MmaShape::kK); static constexpr int kNumKIterationsPerWarpBLoad = diff --git a/aten/src/ATen/native/cuda/cutlass_extensions/gemm/threadblock/dq_mma_pipelined.h b/aten/src/ATen/native/cuda/cutlass_extensions/gemm/threadblock/dq_mma_pipelined.h index 6395713824347..6517e1a4f7a13 100644 --- a/aten/src/ATen/native/cuda/cutlass_extensions/gemm/threadblock/dq_mma_pipelined.h +++ b/aten/src/ATen/native/cuda/cutlass_extensions/gemm/threadblock/dq_mma_pipelined.h @@ -149,7 +149,7 @@ class DqMmaPipelined: public DqMmaBase @@ -426,4 +426,4 @@ struct FastInterleavedAndBiasedNumericArrayConverter { } // namespace cutlass -///////////////////////////////////////////////////////////////////////////////////////////////// \ No newline at end of file +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu b/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu index d970914dbc294..9cebb82e512a8 100644 --- a/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu +++ b/aten/src/ATen/native/cuda/fused_adam_amsgrad_impl.cu @@ -30,11 +30,11 @@ void _fused_adam_amsgrad_cuda_impl_( exp_avg_sqs.vec(), max_exp_avg_sqs.vec()}; - float* grad_scale_ptr = + const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; - float* found_inf_ptr = + const float* found_inf_ptr = found_inf.has_value() ? found_inf->data_ptr() : nullptr; - float* lr_ptr = nullptr; + const float* lr_ptr = nullptr; AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, @@ -45,7 +45,7 @@ void _fused_adam_amsgrad_cuda_impl_( multi_tensor_apply_for_fused_optimizer<5>( tensor_lists, state_steps, - FusedAdamMathFunctor(), + FusedAdamMathFunctor(), lr_ptr, // unused lr, beta1, @@ -53,10 +53,8 @@ void _fused_adam_amsgrad_cuda_impl_( weight_decay, eps, maximize, - /* amsgrad */ true, grad_scale_ptr, - found_inf_ptr, - ADAM_MODE::ORIGINAL); + found_inf_ptr); }); } @@ -83,11 +81,11 @@ void _fused_adam_amsgrad_cuda_impl_( exp_avg_sqs.vec(), max_exp_avg_sqs.vec()}; - float* grad_scale_ptr = + const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; - float* found_inf_ptr = + const float* found_inf_ptr = found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; - float* lr_ptr = lr.data_ptr(); + const float* lr_ptr = lr.const_data_ptr(); AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, @@ -98,7 +96,7 @@ void _fused_adam_amsgrad_cuda_impl_( multi_tensor_apply_for_fused_optimizer<5>( tensor_lists, state_steps, - FusedAdamMathFunctor(), + FusedAdamMathFunctor(), lr_ptr, 1.0, // unused beta1, @@ -106,10 +104,8 @@ void _fused_adam_amsgrad_cuda_impl_( weight_decay, eps, maximize, - /* amsgrad */ true, grad_scale_ptr, - found_inf_ptr, - ADAM_MODE::ORIGINAL); + found_inf_ptr); }); } diff --git a/aten/src/ATen/native/cuda/fused_adam_impl.cu b/aten/src/ATen/native/cuda/fused_adam_impl.cu index 075dd38f3aaed..7f2843b3b4ee4 100644 --- a/aten/src/ATen/native/cuda/fused_adam_impl.cu +++ b/aten/src/ATen/native/cuda/fused_adam_impl.cu @@ -25,11 +25,11 @@ void _fused_adam_cuda_impl_( std::vector> tensor_lists{ params.vec(), grads.vec(), exp_avgs.vec(), exp_avg_sqs.vec()}; - float* grad_scale_ptr = + const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; - float* found_inf_ptr = + const float* found_inf_ptr = found_inf.has_value() ? found_inf->data_ptr() : nullptr; - float* lr_ptr = nullptr; + const float* lr_ptr = nullptr; AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, @@ -40,7 +40,7 @@ void _fused_adam_cuda_impl_( multi_tensor_apply_for_fused_optimizer<4>( tensor_lists, state_steps, - FusedAdamMathFunctor(), + FusedAdamMathFunctor(), lr_ptr, // unused lr, beta1, @@ -48,10 +48,8 @@ void _fused_adam_cuda_impl_( weight_decay, eps, maximize, - /* amsgrad */ false, grad_scale_ptr, - found_inf_ptr, - ADAM_MODE::ORIGINAL); + found_inf_ptr); }); } @@ -73,11 +71,11 @@ void _fused_adam_cuda_impl_( std::vector> tensor_lists{ params.vec(), grads.vec(), exp_avgs.vec(), exp_avg_sqs.vec()}; - float* grad_scale_ptr = + const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; - float* found_inf_ptr = + const float* found_inf_ptr = found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; - float* lr_ptr = lr.data_ptr(); + const float* lr_ptr = lr.const_data_ptr(); AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, @@ -88,7 +86,7 @@ void _fused_adam_cuda_impl_( multi_tensor_apply_for_fused_optimizer<4>( tensor_lists, state_steps, - FusedAdamMathFunctor(), + FusedAdamMathFunctor(), lr_ptr, 1.0, // unused beta1, @@ -96,10 +94,8 @@ void _fused_adam_cuda_impl_( weight_decay, eps, maximize, - /* amsgrad */ false, grad_scale_ptr, - found_inf_ptr, - ADAM_MODE::ORIGINAL); + found_inf_ptr); }); } diff --git a/aten/src/ATen/native/cuda/fused_adam_utils.cuh b/aten/src/ATen/native/cuda/fused_adam_utils.cuh index 25de84ee7c971..182195969ed9a 100644 --- a/aten/src/ATen/native/cuda/fused_adam_utils.cuh +++ b/aten/src/ATen/native/cuda/fused_adam_utils.cuh @@ -3,6 +3,7 @@ #include #include #include +#include namespace at { namespace native { @@ -17,20 +18,25 @@ constexpr uint8_t kExpAvgIdx = 2; constexpr uint8_t kExpAvgSqIdx = 3; constexpr uint8_t kMaxExpAvgSqIdx = 4; -template -C10_DEVICE __forceinline__ void adam_math( +template < + typename scalar_type, + typename opmath_t, + int depth, + ADAM_MODE adam_mode, + bool amsgrad> +C10_DEVICE inline void adam_math( scalar_type r_args[depth][kILP], - const float* step_count, - const double lr, - const double beta1, - const double beta2, - const double weight_decay, - const double eps, - const bool maximize, - const bool amsgrad, + const double& lr, + const double& beta1, + const double& beta2, + const double& weight_decay, + const double& eps, + const bool& maximize, const float* grad_scale_ptr, const float* found_inf_ptr, - const ADAM_MODE adam_mode) { + const opmath_t& bias_correction1, + const opmath_t& bias_correction2_sqrt) { + static_assert(depth == 4 || depth == 5); #pragma unroll for (int ii = 0; ii < kILP; ii++) { // Load values. @@ -51,23 +57,17 @@ C10_DEVICE __forceinline__ void adam_math( } // Update param, grad, 1st and 2nd order momentum. if (weight_decay != 0) { - switch (adam_mode) { - case ADAM_MODE::ORIGINAL: - grad += param * weight_decay; - break; - case ADAM_MODE::ADAMW: - param -= lr * weight_decay * param; - break; + if constexpr (adam_mode == ADAM_MODE::ORIGINAL) { + grad += param * weight_decay; + } else if constexpr (adam_mode == ADAM_MODE::ADAMW) { + param -= lr * weight_decay * param; } } // todo(crcrpar): use lerp // ref: https://developer.nvidia.com/blog/lerp-faster-cuda/ exp_avg = beta1 * exp_avg + (1 - beta1) * grad; exp_avg_sq = beta2 * exp_avg_sq + (1 - beta2) * grad * grad; - const opmath_t bias_correction1 = 1 - at::native::pow_(beta1, *step_count); const opmath_t step_size = lr / bias_correction1; - const opmath_t bias_correction2 = 1 - at::native::pow_(beta2, *step_count); - const opmath_t bias_correction2_sqrt = std::sqrt(bias_correction2); opmath_t denom; if (amsgrad) { max_exp_avg_sq = std::max(max_exp_avg_sq, exp_avg_sq); @@ -102,7 +102,7 @@ C10_DEVICE __forceinline__ void adam_math( // parameter updates accordingly. To be functionally on par with `torch.optim` // optimizers and `_multi_tensor` ones, the kernel below writes out gradients // only when `grad_scale_ptr != nullptr. 
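(Editorial aside, not part of the patch: the surrounding hunks promote `amsgrad` and the `ADAM_MODE` from runtime arguments of `adam_math` to template parameters, and hoist the bias-correction factors out of the per-element math, so each fused-optimizer instantiation compiles only the branches it actually uses. Below is a minimal, simplified sketch of that compile-time dispatch pattern with hypothetical names; first-moment and bias-correction terms are omitted for brevity.)

```cpp
// Illustrative only: hypothetical names, not the ATen kernels themselves.
#include <algorithm>
#include <cmath>
#include <cstdio>

enum class AdamMode { Original, AdamW };

// With the mode and the amsgrad flag as template parameters, `if constexpr`
// removes the untaken branches from each instantiation at compile time.
template <AdamMode mode, bool amsgrad>
void adam_step(double& param, double grad, double& exp_avg_sq,
               double& max_exp_avg_sq, double lr, double weight_decay) {
  if constexpr (mode == AdamMode::Original) {
    grad += param * weight_decay;        // L2 penalty folded into the gradient
  } else {
    param -= lr * weight_decay * param;  // decoupled (AdamW-style) decay
  }
  exp_avg_sq = 0.999 * exp_avg_sq + 0.001 * grad * grad;
  double denom = exp_avg_sq;
  if constexpr (amsgrad) {
    max_exp_avg_sq = std::max(max_exp_avg_sq, exp_avg_sq);
    denom = max_exp_avg_sq;              // normalize by the running maximum
  }
  param -= lr * grad / (std::sqrt(denom) + 1e-8);
}

int main() {
  double p = 1.0, v = 0.0, vmax = 0.0;
  adam_step<AdamMode::AdamW, /*amsgrad=*/true>(p, 0.1, v, vmax, 1e-3, 1e-2);
  std::printf("param after one step: %f\n", p);
}
```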
-template +template struct FusedAdamMathFunctor { static_assert( depth == 4 || depth == 5, @@ -112,33 +112,37 @@ struct FusedAdamMathFunctor { int chunk_size, FusedOptimizerTensorListMetadata& tl, const float* lr_ptr, - const double lr, - const double beta1, - const double beta2, - const double weight_decay, - const double eps, - const bool maximize, - const bool amsgrad, + const double& lr, + const double& beta1, + const double& beta2, + const double& weight_decay, + const double& eps, + const bool& maximize, const float* grad_scale_ptr, - const float* found_inf_ptr, - const ADAM_MODE adam_mode) { - int tensor_loc = tl.block_to_tensor[blockIdx.x]; - int chunk_idx = tl.block_to_chunk[blockIdx.x]; - int n = tl.numel_for_tensor[tensor_loc]; - double lr_double = lr_ptr ? *lr_ptr : lr; + const float* found_inf_ptr) { + const auto tensor_loc = tl.block_to_tensor[blockIdx.x]; + const auto chunk_idx = tl.block_to_chunk[blockIdx.x]; + const double lr_double = lr_ptr ? *lr_ptr : lr; if (found_inf_ptr && *found_inf_ptr == 1) { return; } - auto* step_count = - reinterpret_cast(tl.state_steps_addresses[tensor_loc]); + const auto [bias_correction1, bias_correction2_sqrt] = + [&]() -> std::pair { + auto* step_count = + reinterpret_cast(tl.state_steps_addresses[tensor_loc]); + const auto bias_correction1 = 1 - at::native::pow_(beta1, *step_count); + const auto bias_correction2 = 1 - at::native::pow_(beta2, *step_count); + const auto bias_correction2_sqrt = std::sqrt(bias_correction2); + return {bias_correction1, bias_correction2_sqrt}; + }(); scalar_type* args[depth]; - const bool all_aligned{ - init_args(args, tl, chunk_idx, chunk_size, tensor_loc)}; - n -= chunk_idx * chunk_size; scalar_type r_args[depth][kILP]; + const auto n = tl.numel_for_tensor[tensor_loc] - chunk_idx * chunk_size; + const bool all_aligned{ + init_args(args, tl, chunk_idx, chunk_size, tensor_loc)}; if ((n % kILP == 0) && (chunk_size % kILP == 0) && all_aligned) { for (int64_t i_start = threadIdx.x; i_start * kILP < n && i_start * kILP < chunk_size; @@ -147,19 +151,18 @@ struct FusedAdamMathFunctor { for (int i = 0; i < depth; i++) { load_store(r_args[i], args[i], 0, i_start); } - adam_math( + adam_math( r_args, - step_count, lr_double, beta1, beta2, weight_decay, eps, maximize, - amsgrad, grad_scale_ptr, found_inf_ptr, - adam_mode); + bias_correction1, + bias_correction2_sqrt); #pragma unroll for (int i = 0; i < depth; i++) { if (i != kGradIdx || grad_scale_ptr) { @@ -171,19 +174,18 @@ struct FusedAdamMathFunctor { for (int64_t i_start = 0; i_start < n && i_start < chunk_size; i_start += blockDim.x * kILP) { load_args(r_args, args, i_start, chunk_size, n); - adam_math( + adam_math( r_args, - step_count, lr_double, beta1, beta2, weight_decay, eps, maximize, - amsgrad, grad_scale_ptr, found_inf_ptr, - adam_mode); + bias_correction1, + bias_correction2_sqrt); #pragma unroll for (int i = 0; i < depth; i++) { if (i != kGradIdx || grad_scale_ptr) { diff --git a/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cu b/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cu index 91f6619973317..376711c39db6d 100644 --- a/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cu +++ b/aten/src/ATen/native/cuda/fused_adamw_amsgrad_impl.cu @@ -31,11 +31,11 @@ void _fused_adamw_amsgrad_cuda_impl_( exp_avg_sqs.vec(), max_exp_avg_sqs.vec()}; - float* grad_scale_ptr = + const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; - float* found_inf_ptr = + const float* found_inf_ptr = found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; - float* lr_ptr = nullptr; + const float* lr_ptr = nullptr; AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, @@ -46,7 +46,7 @@ void _fused_adamw_amsgrad_cuda_impl_( multi_tensor_apply_for_fused_optimizer<5>( tensor_lists, state_steps, - FusedAdamMathFunctor(), + FusedAdamMathFunctor(), lr_ptr, // unused lr, beta1, @@ -54,10 +54,8 @@ void _fused_adamw_amsgrad_cuda_impl_( weight_decay, eps, maximize, - /* amsgrad */ true, grad_scale_ptr, - found_inf_ptr, - ADAM_MODE::ADAMW); + found_inf_ptr); }); } @@ -84,11 +82,11 @@ void _fused_adamw_amsgrad_cuda_impl_( exp_avg_sqs.vec(), max_exp_avg_sqs.vec()}; - float* grad_scale_ptr = + const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; - float* found_inf_ptr = + const float* found_inf_ptr = found_inf.has_value() ? found_inf->data_ptr() : nullptr; - float* lr_ptr = lr.data_ptr(); + const float* lr_ptr = lr.const_data_ptr(); AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, @@ -99,7 +97,7 @@ void _fused_adamw_amsgrad_cuda_impl_( multi_tensor_apply_for_fused_optimizer<5>( tensor_lists, state_steps, - FusedAdamMathFunctor(), + FusedAdamMathFunctor(), lr_ptr, 1.0, // unused beta1, @@ -107,10 +105,8 @@ void _fused_adamw_amsgrad_cuda_impl_( weight_decay, eps, maximize, - /* amsgrad */ true, grad_scale_ptr, - found_inf_ptr, - ADAM_MODE::ADAMW); + found_inf_ptr); }); } diff --git a/aten/src/ATen/native/cuda/fused_adamw_impl.cu b/aten/src/ATen/native/cuda/fused_adamw_impl.cu index 847f05671360d..cc4feaa145122 100644 --- a/aten/src/ATen/native/cuda/fused_adamw_impl.cu +++ b/aten/src/ATen/native/cuda/fused_adamw_impl.cu @@ -26,11 +26,11 @@ void _fused_adamw_cuda_impl_( std::vector> tensor_lists{ params.vec(), grads.vec(), exp_avgs.vec(), exp_avg_sqs.vec()}; - float* grad_scale_ptr = + const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; - float* found_inf_ptr = + const float* found_inf_ptr = found_inf.has_value() ? found_inf->data_ptr() : nullptr; - float* lr_ptr = nullptr; + const float* lr_ptr = nullptr; AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, @@ -41,7 +41,7 @@ void _fused_adamw_cuda_impl_( multi_tensor_apply_for_fused_optimizer<4>( tensor_lists, state_steps, - FusedAdamMathFunctor(), + FusedAdamMathFunctor(), lr_ptr, // unused lr, beta1, @@ -49,10 +49,8 @@ void _fused_adamw_cuda_impl_( weight_decay, eps, maximize, - /* amsgrad */ false, grad_scale_ptr, - found_inf_ptr, - ADAM_MODE::ADAMW); + found_inf_ptr); }); } @@ -74,11 +72,11 @@ void _fused_adamw_cuda_impl_( std::vector> tensor_lists{ params.vec(), grads.vec(), exp_avgs.vec(), exp_avg_sqs.vec()}; - float* grad_scale_ptr = + const float* grad_scale_ptr = grad_scale.has_value() ? grad_scale->data_ptr() : nullptr; - float* found_inf_ptr = + const float* found_inf_ptr = found_inf.has_value() ? 
found_inf->data_ptr() : nullptr; - float* lr_ptr = lr.data_ptr(); + const float* lr_ptr = lr.const_data_ptr(); AT_DISPATCH_FLOATING_TYPES_AND2( kHalf, @@ -89,7 +87,7 @@ void _fused_adamw_cuda_impl_( multi_tensor_apply_for_fused_optimizer<4>( tensor_lists, state_steps, - FusedAdamMathFunctor(), + FusedAdamMathFunctor(), lr_ptr, 1.0, // unused beta1, @@ -97,10 +95,8 @@ void _fused_adamw_cuda_impl_( weight_decay, eps, maximize, - /* amsgrad */ false, grad_scale_ptr, - found_inf_ptr, - ADAM_MODE::ADAMW); + found_inf_ptr); }); } diff --git a/aten/src/ATen/native/cuda/group_norm_kernel.cu b/aten/src/ATen/native/cuda/group_norm_kernel.cu index 5a29338c303e0..f3ed79e745382 100644 --- a/aten/src/ATen/native/cuda/group_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/group_norm_kernel.cu @@ -496,11 +496,11 @@ void GroupNorm1dForward( auto iter = TensorIteratorConfig() .resize_outputs(false) .add_owned_output(Y.view({N, G, D})) - .add_owned_input(X.view({N, G, D})) + .add_owned_const_input(X.view({N, G, D})) .add_owned_input(mean.view({N, G, 1})) .add_owned_input(rstd.view({N, G, 1})) - .add_owned_input(gamma.view({1, G, D})) - .add_owned_input(beta.view({1, G, D})) + .add_owned_const_input(gamma.view({1, G, D})) + .add_owned_const_input(beta.view({1, G, D})) .build(); gpu_kernel(iter, [] GPU_LAMBDA(T x, T mean, T rstd, T gamma, T beta) -> T { return (static_cast(x) - static_cast(mean)) * @@ -511,10 +511,10 @@ void GroupNorm1dForward( auto iter = TensorIteratorConfig() .resize_outputs(false) .add_owned_output(Y.view({N, G, D})) - .add_owned_input(X.view({N, G, D})) + .add_owned_const_input(X.view({N, G, D})) .add_owned_input(mean.view({N, G, 1})) .add_owned_input(rstd.view({N, G, 1})) - .add_owned_input(gamma.view({1, G, D})) + .add_owned_const_input(gamma.view({1, G, D})) .build(); gpu_kernel(iter, [] GPU_LAMBDA(T x, T mean, T rstd, T gamma) -> T { return (static_cast(x) - static_cast(mean)) * @@ -524,10 +524,10 @@ void GroupNorm1dForward( auto iter = TensorIteratorConfig() .resize_outputs(false) .add_owned_output(Y.view({N, G, D})) - .add_owned_input(X.view({N, G, D})) + .add_owned_const_input(X.view({N, G, D})) .add_owned_input(mean.view({N, G, 1})) .add_owned_input(rstd.view({N, G, 1})) - .add_owned_input(beta.view({1, G, D})) + .add_owned_const_input(beta.view({1, G, D})) .build(); gpu_kernel(iter, [] GPU_LAMBDA(T x, T mean, T rstd, T beta) -> T { return (static_cast(x) - static_cast(mean)) * @@ -538,7 +538,7 @@ void GroupNorm1dForward( auto iter = TensorIteratorConfig() .resize_outputs(false) .add_owned_output(Y.view({N * G, D})) - .add_owned_input(X.view({N * G, D})) + .add_owned_const_input(X.view({N * G, D})) .add_owned_input(mean.view({N * G, 1})) .add_owned_input(rstd.view({N * G, 1})) .build(); @@ -590,7 +590,7 @@ void GroupNormKernelImplInternal( auto iter = TensorIteratorConfig() .resize_outputs(false) .add_owned_output(Y.view({N * G, D * HxW})) - .add_owned_input(X.view({N * G, D * HxW})) + .add_owned_const_input(X.view({N * G, D * HxW})) .add_owned_input(mean.view({N * G, 1})) .add_owned_input(rstd.view({N * G, 1})) .build(); @@ -611,7 +611,7 @@ void GroupNormKernelImplInternal( T_ACC* b_data = b.mutable_data_ptr(); // TODO: Since there is some issues in gpu_kernel_multiple_outputs, we are - // using maunal kernel here. Make it using gpu_kernel_multiple_outputs once + // using manual kernel here. Make it using gpu_kernel_multiple_outputs once // the issue fixed. 
const int64_t B = (N * C + kCUDANumThreads - 1) / kCUDANumThreads; ComputeFusedParamsCUDAKernel<<>>( @@ -622,7 +622,7 @@ void GroupNormKernelImplInternal( .check_all_same_dtype(std::is_same::value) .resize_outputs(false) .add_owned_output(Y.view({N * C, HxW})) - .add_owned_input(X.view({N * C, HxW})) + .add_owned_const_input(X.view({N * C, HxW})) .add_owned_input(a.view({N * C, 1})) .add_owned_input(b.view({N * C, 1})) .build(); @@ -719,12 +719,12 @@ void GroupNorm1dBackward( .check_all_same_dtype(std::is_same::value) .resize_outputs(false) .add_owned_output(dX.view({N, G, D})) - .add_owned_input(dY.view({N, G, D})) - .add_owned_input(X.view({N, G, D})) - .add_owned_input(rstd.view({N, G, 1})) - .add_owned_input(gamma.view({1, G, D})) - .add_owned_input(c2.view({N, G, 1})) - .add_owned_input(c3.view({N, G, 1})) + .add_owned_const_input(dY.view({N, G, D})) + .add_owned_const_input(X.view({N, G, D})) + .add_owned_const_input(rstd.view({N, G, 1})) + .add_owned_const_input(gamma.view({1, G, D})) + .add_owned_const_input(c2.view({N, G, 1})) + .add_owned_const_input(c3.view({N, G, 1})) .build(); gpu_kernel( iter, @@ -739,11 +739,11 @@ void GroupNorm1dBackward( .check_all_same_dtype(std::is_same::value) .resize_outputs(false) .add_owned_output(dX.view({N * G, D})) - .add_owned_input(dY.view({N * G, D})) - .add_owned_input(X.view({N * G, D})) - .add_owned_input(rstd.view({N * G, 1})) - .add_owned_input(c2.view({N * G, 1})) - .add_owned_input(c3.view({N * G, 1})) + .add_owned_const_input(dY.view({N * G, D})) + .add_owned_const_input(X.view({N * G, D})) + .add_owned_const_input(rstd.view({N * G, 1})) + .add_owned_const_input(c2.view({N * G, 1})) + .add_owned_const_input(c3.view({N * G, 1})) .build(); gpu_kernel( iter, [] GPU_LAMBDA(T dy, T x, T rstd, T_ACC c2, T_ACC c3) -> T { @@ -772,7 +772,7 @@ void GroupNorm1dBackward( } else { const int64_t B = (C + kReduceTileSize - 1) / kReduceTileSize; // The algorithm for colwise reduction here is to accumulate each 32 cols - // to a 32 * 32 tile and write the tile to shared memmory. Then do warp + // to a 32 * 32 tile and write the tile to shared memory. Then do warp // reduce for each col in the tile. So here the blockDim must be (32, 16). 
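(Editorial aside, not part of the patch: the comment above describes the tiled column-wise reduction used for the weight/bias gradients — partial sums for 32 columns are staged in a 32 x 32 shared-memory tile, then each column is collapsed by one warp. The kernel below is a simplified, hypothetical illustration of that idea; it uses a (32, 32) block rather than the (32, 16) block the real kernel requires, and plain column sums instead of the fused gamma/beta terms.)

```cuda
// Hypothetical illustration, not the PyTorch kernel. Sums each column of a
// row-major [rows x cols] matrix. Launch with blockDim = dim3(32, 32) and
// gridDim.x = ceil_div(cols, 32).
__global__ void colwise_sum(const float* in, float* out, int rows, int cols) {
  __shared__ float tile[32][33];  // padded second dim avoids bank conflicts
  const int col = blockIdx.x * 32 + threadIdx.x;

  // Phase 1: each thread strides down one column, accumulating a partial sum.
  float acc = 0.f;
  if (col < cols) {
    for (int r = threadIdx.y; r < rows; r += 32) {
      acc += in[static_cast<long long>(r) * cols + col];
    }
  }
  tile[threadIdx.y][threadIdx.x] = acc;
  __syncthreads();

  // Phase 2: read the tile transposed so one warp owns the 32 partial sums of
  // a single column, then collapse them with warp shuffles.
  float v = tile[threadIdx.x][threadIdx.y];
  for (int offset = 16; offset > 0; offset >>= 1) {
    v += __shfl_down_sync(0xffffffffu, v, offset);
  }
  if (threadIdx.x == 0) {  // lane 0 holds the column total
    const int out_col = blockIdx.x * 32 + threadIdx.y;
    if (out_col < cols) {
      out[out_col] = v;
    }
  }
}
```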
constexpr int kThreadX = kReduceTileSize; constexpr int kThreadY = kReduceTileSize / 2; @@ -865,8 +865,8 @@ void GroupNormBackwardKernelImplInternal( auto iter = TensorIteratorConfig() .check_all_same_dtype(std::is_same::value) .add_output(c1) - .add_owned_input(rstd.view({N, G, 1})) - .add_owned_input(gamma.view({1, G, D})) + .add_owned_const_input(rstd.view({N, G, 1})) + .add_owned_const_input(gamma.view({1, G, D})) .build(); gpu_kernel(iter, [] GPU_LAMBDA(T rstd, T gamma) -> T_ACC { return static_cast(rstd) * static_cast(gamma); @@ -895,11 +895,11 @@ void GroupNormBackwardKernelImplInternal( .check_all_same_dtype(std::is_same::value) .resize_outputs(false) .add_owned_output(dX.view({N * G, D, HxW})) - .add_owned_input(dY.view({N * G, D, HxW})) - .add_owned_input(X.view({N * G, D, HxW})) - .add_owned_input(c1.view({N * G, D, 1})) - .add_owned_input(c2.view({N * G, 1, 1})) - .add_owned_input(c3.view({N * G, 1, 1})) + .add_owned_const_input(dY.view({N * G, D, HxW})) + .add_owned_const_input(X.view({N * G, D, HxW})) + .add_owned_const_input(c1.view({N * G, D, 1})) + .add_owned_const_input(c2.view({N * G, 1, 1})) + .add_owned_const_input(c3.view({N * G, 1, 1})) .build(); gpu_kernel( iter, [] GPU_LAMBDA(T dy, T x, T_ACC c1, T_ACC c2, T_ACC c3) -> T { @@ -911,11 +911,11 @@ void GroupNormBackwardKernelImplInternal( .check_all_same_dtype(std::is_same::value) .resize_outputs(false) .add_owned_output(dX.view({N * G, D * HxW})) - .add_owned_input(dY.view({N * G, D * HxW})) - .add_owned_input(X.view({N * G, D * HxW})) - .add_owned_input(rstd.view({N * G, 1})) - .add_owned_input(c2.view({N * G, 1})) - .add_owned_input(c3.view({N * G, 1})) + .add_owned_const_input(dY.view({N * G, D * HxW})) + .add_owned_const_input(X.view({N * G, D * HxW})) + .add_owned_const_input(rstd.view({N * G, 1})) + .add_owned_const_input(c2.view({N * G, 1})) + .add_owned_const_input(c3.view({N * G, 1})) .build(); gpu_kernel( iter, [] GPU_LAMBDA(T dy, T x, T_ACC c1, T_ACC c2, T_ACC c3) -> T { @@ -944,7 +944,7 @@ void GroupNormBackwardKernelImplInternal( } else { const int64_t B = (C + kReduceTileSize - 1) / kReduceTileSize; // The algorithm for colwise reduction here is to accumulate each 32 cols - // to a 32 * 32 tile and write the tile to shared memmory. Then do warp + // to a 32 * 32 tile and write the tile to shared memory. Then do warp // reduce for each col in the tile. So here the blockDim must be (32, 16). 
constexpr int kThreadX = kReduceTileSize; constexpr int kThreadY = kReduceTileSize / 2; diff --git a/aten/src/ATen/native/cuda/im2col.cuh b/aten/src/ATen/native/cuda/im2col.cuh index 06eef13208c67..ec74617de34a1 100644 --- a/aten/src/ATen/native/cuda/im2col.cuh +++ b/aten/src/ATen/native/cuda/im2col.cuh @@ -34,7 +34,7 @@ __global__ void im2col_kernel( const int64_t height_col, const int64_t width_col, dt* data_col) { - CUDA_KERNEL_LOOP(index, n) { + CUDA_KERNEL_LOOP_TYPE(index, n, int64_t) { int64_t w_out = index % width_col; int64_t idx = index / width_col; diff --git a/aten/src/ATen/native/cuda/int4mm.cu b/aten/src/ATen/native/cuda/int4mm.cu index 07a70013b26f4..fcfcd2e5ebbdb 100644 --- a/aten/src/ATen/native/cuda/int4mm.cu +++ b/aten/src/ATen/native/cuda/int4mm.cu @@ -868,7 +868,6 @@ at::Tensor _weight_int4pack_mm_cuda( int64_t qGroupSize, const at::Tensor& qScaleAndZeros) { c10::cuda::CUDAGuard g(A.device()); - auto stream = at::cuda::getCurrentCUDAStream(); TORCH_CHECK( A.device() == B.device() && A.device() == qScaleAndZeros.device()); @@ -926,6 +925,7 @@ at::Tensor _weight_int4pack_mm_cuda( {m, n}, at::TensorOptions().dtype(at::kBFloat16).device(A.device())); #if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)) + auto stream = at::cuda::getCurrentCUDAStream(); #define RUN_GEMM(WARPS, K_TILES_PER_WARP, Q_GROUP_SIZE, REDUCE_TYPE) \ do { \ using ACLayout = ALayout_RM; \ @@ -1041,7 +1041,6 @@ at::Tensor _convert_weight_to_int4pack_cuda( const at::Tensor& in, int64_t innerKTiles) { c10::cuda::CUDAGuard g(in.device()); - auto stream = at::cuda::getCurrentCUDAStream(); TORCH_CHECK(in.dim() == 2); TORCH_CHECK(in.dtype() == at::kInt); @@ -1072,6 +1071,7 @@ at::Tensor _convert_weight_to_int4pack_cuda( at::TensorOptions().dtype(at::kInt).device(in.device())); #if (defined(CUDA_VERSION) && CUDA_VERSION >= 12000) && (!defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 800)) + auto stream = at::cuda::getCurrentCUDAStream(); dim3 grid(kSuperTiles, nTiles); if (innerKTiles == 2) { diff --git a/aten/src/ATen/native/cuda/jit_utils.cpp b/aten/src/ATen/native/cuda/jit_utils.cpp index 61781e03b4a96..6e804efe5f847 100644 --- a/aten/src/ATen/native/cuda/jit_utils.cpp +++ b/aten/src/ATen/native/cuda/jit_utils.cpp @@ -20,7 +20,7 @@ #include #include -// TODO: C++17 has the fileystem header, which may replace these +// TODO: C++17 has the filesystem header, which may replace these #ifdef _WIN32 // On Windows, the POSIX implementations are considered deprecated. We simply map to the newer variant. #include @@ -1500,7 +1500,11 @@ NvrtcFunction jit_pwise_function( std::stringstream ss; ss << *cache_dir << "/"; ss << kernel_name; +#ifdef USE_ROCM + ss << "_arch" << prop->gcnArchName; +#else ss << "_arch" << cuda_major << "." << cuda_minor; +#endif ss << "_nvrtc" << nvrtc_major << "." << nvrtc_minor; ss << (compile_to_sass ? 
"_sass" : "_ptx"); ss << "_" << code.length(); @@ -1510,7 +1514,7 @@ NvrtcFunction jit_pwise_function( std::ifstream readin{file_path, std::ios::in | std::ifstream::binary}; if (readin.fail()) { // NOTE: this does not warn because the file might not exist - // TODO: consider if this should explicilty check for the file's existence or not to throw + // TODO: consider if this should explicitly check for the file's existence or not to throw // an informative warning readin.close(); } else { @@ -1537,7 +1541,7 @@ NvrtcFunction jit_pwise_function( // Constructs nvrtc build arguments // CUDA 11.1 allows going directly to SASS (sm_) instead of PTX (compute_) // which gives better backwards compatibility to work on older driver, - // (since older driver doesn't necessrily recognize PTX emitted by new + // (since older driver doesn't necessarily recognize PTX emitted by new // toolkit); // Meanwhile, for forward compatibility (future device with // `unsupported_arch==True`), since SASS are not necessarily compatible, @@ -1565,11 +1569,9 @@ NvrtcFunction jit_pwise_function( if (compilation_result != NVRTC_SUCCESS) { size_t logsize; AT_CUDA_NVRTC_CHECK(nvrtc.nvrtcGetProgramLogSize(program, &logsize)); - std::vector log(logsize); - AT_CUDA_NVRTC_CHECK(nvrtc.nvrtcGetProgramLog(program, log.data())); - std::stringstream cu; - cu << log.data(); - throw std::runtime_error(code + cu.str()); + std::string log(logsize, '\0'); + AT_CUDA_NVRTC_CHECK(nvrtc.nvrtcGetProgramLog(program, &log[0])); + throw std::runtime_error(code + log); } size_t ptx_size = 0; diff --git a/aten/src/ATen/native/cuda/layer_norm_kernel.cu b/aten/src/ATen/native/cuda/layer_norm_kernel.cu index bc5190874ffee..6423dddbb2995 100644 --- a/aten/src/ATen/native/cuda/layer_norm_kernel.cu +++ b/aten/src/ATen/native/cuda/layer_norm_kernel.cu @@ -297,7 +297,7 @@ __device__ __inline__ void vectorized_layer_norm_kernel_impl( //to avoid windows SFINAE errors template -__global__ __inline__ void vectorized_layer_norm_kernel( +__global__ void vectorized_layer_norm_kernel( const int N, T_ACC eps, const T* __restrict__ X, @@ -393,7 +393,7 @@ __global__ void layer_norm_grad_input_kernel( // This implementation gets called when input buffers (dY, X, gamma and dX) are aligned // to vec_size * sizeof(T). Compared to the unvectorized implementation, it is about 10% -// faster measuread at PT operator level, with cases seeing a 2X speedup (where N >> M). +// faster measured at PT operator level, with cases seeing a 2X speedup (where N >> M). // There are no noticeable regressions on the rest of the sizes. template @@ -1149,12 +1149,12 @@ void LayerNormBackwardKernelImplInternal( file a support request to support bigger batches"); TORCH_CHECK(N <= std::numeric_limits::max(), "Normalized shape should have less than INT_MAX elements, \ file a support request to support bigger normalized shapes"); - const T* dY_data = dY.template data_ptr(); - const T* X_data = X.template data_ptr(); - const T_ACC* mean_data = mean.template data_ptr(); - const T_ACC* rstd_data = rstd.template data_ptr(); + const T* dY_data = dY.template const_data_ptr(); + const T* X_data = X.template const_data_ptr(); + const T_ACC* mean_data = mean.template const_data_ptr(); + const T_ACC* rstd_data = rstd.template const_data_ptr(); const T* gamma_data = - gamma.defined() ? gamma.template data_ptr() : nullptr; + gamma.defined() ? gamma.template const_data_ptr() : nullptr; T* dX_data = dX->defined() ? 
dX->template data_ptr() : nullptr; cudaStream_t cuda_stream = at::cuda::getCurrentCUDAStream(); const int warp_size = at::cuda::warp_size(); diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp index a08547dc21b6a..5471c57ec30ed 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebra.cpp @@ -1992,7 +1992,7 @@ void linalg_eigh_magma(const Tensor& eigenvalues, const Tensor& eigenvectors, co } void linalg_eigh_kernel(const Tensor& eigenvalues, const Tensor& eigenvectors, const Tensor& infos, bool upper, bool compute_eigenvectors) { -#if defined(USE_LINALG_SOLVER) && !defined(USE_ROCM) +#if defined(USE_LINALG_SOLVER) auto preferred_backend = at::globalContext().linalgPreferredBackend(); switch (preferred_backend) { case at::LinalgBackend::Magma: @@ -2427,7 +2427,7 @@ static void lu_solve_kernel(const Tensor& LU, const Tensor& pivots, const Tensor // magma implementation of LU solve cannot handle a b tensor with last dim > 1024 // See https://bitbucket.org/icl/magma/issues/19/dgesv_batched-dgetrs_batched-fails-for bool over_batched_magma_dim_limit = k > 1024; - // heuristics determined from tests dicussed in https://github.com/pytorch/pytorch/pull/72935 + // heuristics determined from tests discussed in https://github.com/pytorch/pytorch/pull/72935 // Computes X = U^{-1}L^{-1}P^T B via triangular solves // Helps mitigating the bugs in magma @@ -2443,7 +2443,7 @@ static void lu_solve_kernel(const Tensor& LU, const Tensor& pivots, const Tensor .resize_outputs(false) .declare_static_shape(pivots_->sizes(), /*squash_dim=*/pivots_->dim() - 1) .add_output(perm) - .add_input(*pivots_) + .add_const_input(*pivots_) .build(); unpack_pivots_stub(pivots_->device().type(), iter, n, n); diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp index ec65435d6c8df..06b095af4f66e 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLib.cpp @@ -135,11 +135,11 @@ void apply_ldl_solve_cusolver( auto b_stride = B.dim() > 2 ? B.stride(-3) : 0; auto pivots_stride = pivots.dim() > 1 ? pivots.stride(-2) : 0; - auto a_data = A.data_ptr(); + auto a_data = A.const_data_ptr(); auto b_data = B.data_ptr(); auto pivots_ = pivots.to(kLong); - auto pivots_data = pivots_.data_ptr(); + auto pivots_data = pivots_.const_data_ptr(); // needed to run ldl_solve tests in parallel // see https://github.com/pytorch/pytorch/issues/82894 for examples of failures @@ -175,9 +175,9 @@ void apply_ldl_solve_cusolver( Tensor info = at::zeros({}, A.options().dtype(at::kInt)); for (const auto i : c10::irange(batch_size)) { - auto* a_working_ptr = &a_data[i * a_stride]; + const auto* a_working_ptr = &a_data[i * a_stride]; auto* b_working_ptr = &b_data[i * b_stride]; - auto* pivots_working_ptr = &pivots_data[i * pivots_stride]; + const auto* pivots_working_ptr = &pivots_data[i * pivots_stride]; TORCH_CUSOLVER_CHECK(cusolverDnXsytrs( handle, uplo, @@ -1078,8 +1078,8 @@ static void apply_ormqr(const Tensor& input, const Tensor& tau, const Tensor& ot auto side = left ? CUBLAS_SIDE_LEFT : CUBLAS_SIDE_RIGHT; auto trans = transpose ? (input.is_complex() ? 
CUBLAS_OP_C : CUBLAS_OP_T) : CUBLAS_OP_N; - auto input_data = input.data_ptr(); - auto tau_data = tau.data_ptr(); + auto input_data = input.const_data_ptr(); + auto tau_data = tau.const_data_ptr(); auto other_data = other.data_ptr(); auto input_matrix_stride = matrixStride(input); @@ -1101,9 +1101,9 @@ static void apply_ormqr(const Tensor& input, const Tensor& tau, const Tensor& ot auto info_data = info.data_ptr(); for (auto i = decltype(batch_size){0}; i < batch_size; i++) { - scalar_t* input_working_ptr = &input_data[i * input_matrix_stride]; + const scalar_t* input_working_ptr = &input_data[i * input_matrix_stride]; scalar_t* other_working_ptr = &other_data[i * other_matrix_stride]; - scalar_t* tau_working_ptr = &tau_data[i * tau_stride]; + const scalar_t* tau_working_ptr = &tau_data[i * tau_stride]; auto handle = at::cuda::getCurrentCUDASolverDnHandle(); // allocate workspace storage @@ -1149,7 +1149,7 @@ void ormqr_cusolver(const Tensor& input, const Tensor& tau, const Tensor& other, template inline static void apply_orgqr(Tensor& self, const Tensor& tau) { auto self_data = self.data_ptr(); - auto tau_data = tau.data_ptr(); + auto tau_data = tau.const_data_ptr(); auto self_matrix_stride = matrixStride(self); auto batchsize = cuda_int_cast(batchCount(self), "batch size"); auto m = cuda_int_cast(self.size(-2), "m"); @@ -1180,7 +1180,7 @@ inline static void apply_orgqr(Tensor& self, const Tensor& tau) { for (auto i = decltype(batchsize){0}; i < batchsize; i++) { scalar_t* self_working_ptr = &self_data[i * self_matrix_stride]; - scalar_t* tau_working_ptr = &tau_data[i * tau_stride]; + const scalar_t* tau_working_ptr = &tau_data[i * tau_stride]; auto handle = at::cuda::getCurrentCUDASolverDnHandle(); // allocate workspace storage @@ -1434,8 +1434,12 @@ static void linalg_eigh_cusolver_syevj_batched(const Tensor& eigenvalues, const } void linalg_eigh_cusolver(const Tensor& eigenvalues, const Tensor& eigenvectors, const Tensor& infos, bool upper, bool compute_eigenvectors) { + // for ROCm's hipSolver, syevj is fastest. 
+#ifdef USE_ROCM + linalg_eigh_cusolver_syevj(eigenvalues, eigenvectors, infos, upper, compute_eigenvectors); +#else if (use_cusolver_syevj_batched_ && batchCount(eigenvectors) > 1 && eigenvectors.size(-1) <= 32) { - // Use syevjBatched for batched matrix opertion when matrix size <= 32 + // Use syevjBatched for batched matrix operation when matrix size <= 32 // See https://github.com/pytorch/pytorch/pull/53040#issuecomment-788264724 linalg_eigh_cusolver_syevj_batched(eigenvalues, eigenvectors, infos, upper, compute_eigenvectors); } else if (eigenvectors.scalar_type() == at::kFloat && eigenvectors.size(-1) >= 32 && eigenvectors.size(-1) <= 512) { @@ -1445,6 +1449,7 @@ void linalg_eigh_cusolver(const Tensor& eigenvalues, const Tensor& eigenvectors, } else { linalg_eigh_cusolver_syevd(eigenvalues, eigenvectors, infos, upper, compute_eigenvectors); } +#endif } // The 'apply_' word is used for templated by dtype functions that call an API routine diff --git a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLibBlas.cpp b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLibBlas.cpp index 38e7b8dd3288b..2a9f46e6f73e7 100644 --- a/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLibBlas.cpp +++ b/aten/src/ATen/native/cuda/linalg/BatchLinearAlgebraLibBlas.cpp @@ -133,7 +133,7 @@ static void apply_lu_solve_batched_cublas(const Tensor& LU, const Tensor& pivots TORCH_INTERNAL_ASSERT(batchCount(LU) == batchCount(pivots.unsqueeze(-1)), "batch_size of LU and pivots must be the same"); const auto trans = to_cublas(transpose); - auto pivots_data = pivots.data_ptr(); + auto pivots_data = pivots.const_data_ptr(); auto batch_size = cuda_int_cast(batchCount(LU), "batch_size");; auto m = cuda_int_cast(LU.size(-2), "m"); auto nrhs = cuda_int_cast(B.size(-1), "nrhs"); @@ -142,12 +142,12 @@ static void apply_lu_solve_batched_cublas(const Tensor& LU, const Tensor& pivots Tensor lu_ptr_array = get_device_pointers(LU); Tensor b_ptr_array = get_device_pointers(B); - auto lu_ptr_array_data = reinterpret_cast(lu_ptr_array.data_ptr()); + auto lu_ptr_array_data = reinterpret_cast(lu_ptr_array.const_data_ptr()); auto b_ptr_array_data = reinterpret_cast(b_ptr_array.data_ptr()); auto handle = at::cuda::getCurrentCUDABlasHandle(); - at::cuda::blas::getrsBatched(handle, trans, m, nrhs, lu_ptr_array_data, - lda, pivots_data, b_ptr_array_data, lda, &info, batch_size); + at::cuda::blas::getrsBatched(handle, trans, m, nrhs, const_cast(lu_ptr_array_data), + lda, const_cast(pivots_data), b_ptr_array_data, lda, &info, batch_size); TORCH_INTERNAL_ASSERT_DEBUG_ONLY(info == 0); } @@ -218,6 +218,20 @@ static void apply_triangular_solve_batched(const Tensor& A, const Tensor& B, boo } void triangular_solve_batched_cublas(const Tensor& A, const Tensor& B, bool left, bool upper, TransposeType transpose, bool unitriangular) { + // Workaround the following a bug on CUDA < 12.1 + // RuntimeError: CUDA error: CUBLAS_STATUS_EXECUTION_FAILED when calling `cublasStrsmBatched + // See https://github.com/pytorch/pytorch/issues/79191#issuecomment-1154222580 +#if defined(CUSOLVER_VERSION) && CUSOLVER_VERSION < 12100 + constexpr auto max_batch_size = 524280; + if (B.size(-1) > max_batch_size) { + auto n_chunks = (B.size(-1) + max_batch_size - 1) / max_batch_size; // ceildiv + auto splits = B.split(n_chunks, /*dim=*/-1); + for (const Tensor& b : splits) { + triangular_solve_batched_cublas(A, b, left, upper, transpose, unitriangular); + } + return; + } +#endif AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES(A.scalar_type(), "triangular_solve_cuda", [&]{ 
apply_triangular_solve_batched(A, B, left, upper, transpose, unitriangular); }); diff --git a/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp b/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp index bdb0d26a1b690..3016897c66c5d 100644 --- a/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp +++ b/aten/src/ATen/native/cuda/linalg/CusolverDnHandlePool.cpp @@ -29,7 +29,7 @@ using CuSolverDnPoolType = DeviceThreadHandlePool #include +#include #include #ifndef AT_PER_OPERATOR_HEADERS #include #include #else -#include -#include #include +#include +#include #endif #if !AT_CUDNN_ENABLED() -namespace at { namespace native { +namespace at { +namespace native { // See Note [ATen preprocessor philosophy] Tensor cudnn_affine_grid_generator_forward( const Tensor& theta, - int64_t N, int64_t C, int64_t H, int64_t W) { - AT_ERROR("cudnn_affine_grid_generator_forward: ATen not compiled with cuDNN support"); + int64_t N, + int64_t C, + int64_t H, + int64_t W) { + AT_ERROR( + "cudnn_affine_grid_generator_forward: ATen not compiled with cuDNN support"); } Tensor cudnn_affine_grid_generator_backward( const Tensor& grad_theta, - int64_t N, int64_t C, int64_t H, int64_t W) { - AT_ERROR("cudnn_affine_grid_generator_backward: ATen not compiled with cuDNN support"); + int64_t N, + int64_t C, + int64_t H, + int64_t W) { + AT_ERROR( + "cudnn_affine_grid_generator_backward: ATen not compiled with cuDNN support"); } -}} +} // namespace native +} // namespace at #else // AT_CUDNN_ENABLED() -#include -#include +#include #include +#include #include #include -#include +#include #include -namespace at { namespace native { +namespace at { +namespace native { namespace { -void setSamplerDescriptor(SpatialTransformerDescriptor& desc, - cudnnDataType_t dataType, - int N, int C, int H, int W) -{ +void setSamplerDescriptor( + SpatialTransformerDescriptor& desc, + cudnnDataType_t dataType, + int N, + int C, + int H, + int W) { int inputSize[4] = {N, C, H, W}; desc.set(dataType, 4, inputSize); } -} // namespace +} // namespace Tensor cudnn_affine_grid_generator_forward( const Tensor& theta_t, - int64_t N, int64_t C, int64_t H, int64_t W) -{ + int64_t N, + int64_t C, + int64_t H, + int64_t W) { auto theta_t_contig = theta_t.contiguous(); - TensorArg theta{ theta_t_contig, "theta", 1 }; + TensorArg theta{theta_t_contig, "theta", 1}; CheckedFrom c = "cudnn_affine_grid_generator_forward"; checkContiguous(c, theta); checkSize(c, theta, {N, 2, 3}); @@ -73,18 +89,19 @@ Tensor cudnn_affine_grid_generator_forward( auto dataType = getCudnnDataType(*theta); SpatialTransformerDescriptor desc; setSamplerDescriptor(desc, dataType, N, C, H, W); - AT_CUDNN_CHECK(cudnnSpatialTfGridGeneratorForward(getCudnnHandle(), desc.desc(), - theta->data_ptr(), - grid_t.data_ptr())); + AT_CUDNN_CHECK(cudnnSpatialTfGridGeneratorForward( + getCudnnHandle(), desc.desc(), theta->data_ptr(), grid_t.data_ptr())); return grid_t; } Tensor cudnn_affine_grid_generator_backward( const Tensor& grad_grid_t, - int64_t N, int64_t C, int64_t H, int64_t W) -{ + int64_t N, + int64_t C, + int64_t H, + int64_t W) { auto grad_grid_contig = grad_grid_t.contiguous(); - TensorArg grad_grid{ grad_grid_contig, "grad_grid", 1 }; + TensorArg grad_grid{grad_grid_contig, "grad_grid", 1}; CheckedFrom c = "cudnn_affine_grid_generator_backward"; checkContiguous(c, grad_grid); checkSize(c, grad_grid, {N, H, W, 2}); @@ -95,12 +112,15 @@ Tensor cudnn_affine_grid_generator_backward( auto dataType = getCudnnDataType(grad_theta_t); SpatialTransformerDescriptor desc; 
setSamplerDescriptor(desc, dataType, N, C, H, W); - AT_CUDNN_CHECK(cudnnSpatialTfGridGeneratorBackward(getCudnnHandle(), desc.desc(), - grad_grid->data_ptr(), - grad_theta_t.data_ptr())); + AT_CUDNN_CHECK(cudnnSpatialTfGridGeneratorBackward( + getCudnnHandle(), + desc.desc(), + grad_grid->data_ptr(), + grad_theta_t.data_ptr())); return grad_theta_t; } -}} // namespace at::native +} // namespace native +} // namespace at #endif // AT_CUDNN_ENABLED() diff --git a/aten/src/ATen/native/cudnn/BatchNorm.cpp b/aten/src/ATen/native/cudnn/BatchNorm.cpp index f18318fd0dcf8..44b004dff0007 100644 --- a/aten/src/ATen/native/cudnn/BatchNorm.cpp +++ b/aten/src/ATen/native/cudnn/BatchNorm.cpp @@ -1,36 +1,63 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include #include +#include #include +#ifdef __HIP_PLATFORM_AMD__ +#include +#else +#include +#endif + #if !AT_CUDNN_ENABLED() -namespace at { namespace native { +namespace at { +namespace native { // See Note [ATen preprocessor philosophy] std::tuple cudnn_batch_norm( - const Tensor& input, const Tensor& weight, const c10::optional& bias_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, - bool training, double exponential_average_factor, double epsilon) { + const Tensor& input, + const Tensor& weight, + const c10::optional& bias_opt, + const c10::optional& running_mean_opt, + const c10::optional& running_var_opt, + bool training, + double exponential_average_factor, + double epsilon) { AT_ERROR("cudnn_batch_norm: ATen not compiled with cuDNN support"); } std::tuple cudnn_batch_norm_backward( - const Tensor& input, const Tensor& grad_output, const Tensor& weight, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_var_opt, - double epsilon, const Tensor& reservedSpace) { + const Tensor& input, + const Tensor& grad_output, + const Tensor& weight, + const c10::optional& running_mean_opt, + const c10::optional& running_var_opt, + const c10::optional& save_mean_opt, + const c10::optional& save_var_opt, + double epsilon, + const Tensor& reservedSpace) { AT_ERROR("cudnn_batch_norm_backward: ATen not compiled with cuDNN support"); } -}} // namespace at::native +size_t _get_cudnn_batch_norm_reserve_space_size( + const Tensor& input_t, + bool training) { + AT_ERROR( + "_get_cudnn_batch_norm_reserve_space_size: ATen not compiled with cuDNN support"); +} + +} // namespace native +} // namespace at #else // AT_CUDNN_ENABLED +#include +#include #include #include #include -#include - -#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -42,33 +69,29 @@ std::tuple cudnn_batch_norm_backward( #include #endif -namespace at { namespace native { +namespace at { +namespace native { namespace { Tensor expandScale(const Tensor& t, int64_t dim) { - std::vector size{ 1, t.numel() }; + std::vector size{1, t.numel()}; while (static_cast(size.size()) < dim) { size.emplace_back(1); } return t.view(size); } -cudnnBatchNormMode_t getCudnnBatchNormMode(bool training, at::MemoryFormat memory_format, int64_t dim) { +cudnnBatchNormMode_t getCudnnBatchNormMode( + bool training, + at::MemoryFormat memory_format, + int64_t dim) { if (dim == 2) { return CUDNN_BATCHNORM_PER_ACTIVATION; } else if (training && memory_format == at::MemoryFormat::ChannelsLast) { - return CUDNN_BATCHNORM_SPATIAL_PERSISTENT; - } else if (training && memory_format == at::MemoryFormat::ChannelsLast3d) { - -#if CUDNN_VERSION >= 8100 return CUDNN_BATCHNORM_SPATIAL_PERSISTENT; -#else - return 
CUDNN_BATCHNORM_SPATIAL; -#endif // CUDNN_VERSION >= 8100 - } else { // TODO: The new CUDNN_BATCHNORM_SPATIAL_PERSISTENT mode was // introduced in CuDNN 7 for performance optimization, but it results in @@ -78,23 +101,44 @@ cudnnBatchNormMode_t getCudnnBatchNormMode(bool training, at::MemoryFormat memor } } -} // namespace +} // namespace + +size_t _get_cudnn_batch_norm_reserve_space_size( + const Tensor& input_t, + bool training) { + size_t reserve_size; + TensorArg input{input_t, "input", 1}; + TensorDescriptor idesc{*input, 4}; + auto handle = getCudnnHandle(); + cudnnBatchNormMode_t mode = getCudnnBatchNormMode( + training, input->suggest_memory_format(), input->dim()); + auto op = CUDNN_BATCHNORM_OPS_BN; + AT_CUDNN_CHECK(cudnnGetBatchNormalizationTrainingExReserveSpaceSize( + handle, mode, op, nullptr, idesc.desc(), &reserve_size)); + return reserve_size; +} std::tuple cudnn_batch_norm( - const Tensor& input_t, const Tensor& weight_t, const c10::optional& bias_t_opt, const c10::optional& running_mean_t_opt, const c10::optional& running_var_t_opt, - bool training, double exponential_average_factor, double epsilon) -{ + const Tensor& input_t, + const Tensor& weight_t, + const c10::optional& bias_t_opt, + const c10::optional& running_mean_t_opt, + const c10::optional& running_var_t_opt, + bool training, + double exponential_average_factor, + double epsilon) { // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned bias_t_maybe_owned = at::borrow_from_optional_tensor(bias_t_opt); + c10::MaybeOwned bias_t_maybe_owned = + at::borrow_from_optional_tensor(bias_t_opt); const Tensor& bias_t = *bias_t_maybe_owned; - const Tensor& running_mean_t = c10::value_or_else(running_mean_t_opt, [] {return Tensor();}); - const Tensor& running_var_t = c10::value_or_else(running_var_t_opt, [] {return Tensor();}); - - TensorArg input{ input_t, "input", 1 }, - weight{ weight_t, "weight", 2 }, - bias{ bias_t, "bias", 3 }, - running_mean{ running_mean_t, "running_mean", 4 }, - running_var{ running_var_t, "running_var", 5 }; + const Tensor& running_mean_t = + c10::value_or_else(running_mean_t_opt, [] { return Tensor(); }); + const Tensor& running_var_t = + c10::value_or_else(running_var_t_opt, [] { return Tensor(); }); + + TensorArg input{input_t, "input", 1}, weight{weight_t, "weight", 2}, + bias{bias_t, "bias", 3}, running_mean{running_mean_t, "running_mean", 4}, + running_var{running_var_t, "running_var", 5}; CheckedFrom c = "cudnn_batch_norm"; checkAllDefined(c, {input, weight, bias}); @@ -122,19 +166,19 @@ std::tuple cudnn_batch_norm( } cudnnBatchNormMode_t mode = getCudnnBatchNormMode( - training, - input->suggest_memory_format(), - input->dim() - ); + training, input->suggest_memory_format(), input->dim()); - auto output_t = at::empty_like(*input, input->options(), input->suggest_memory_format()); + auto output_t = + at::empty_like(*input, input->options(), input->suggest_memory_format()); - TensorArg output{ output_t, "output", 0 }; + TensorArg output{output_t, "output", 0}; auto handle = getCudnnHandle(); auto dataType = getCudnnDataType(*input); - TensorDescriptor idesc{ *input, 4 }; // input descriptor - TensorDescriptor wdesc{ expandScale(*weight, input->dim()), 4 }; // descriptor for weight, bias, running_mean, etc. + TensorDescriptor idesc{*input, 4}; // input descriptor + TensorDescriptor wdesc{ + expandScale(*weight, input->dim()), + 4}; // descriptor for weight, bias, running_mean, etc. 
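(Editorial aside, not part of the patch hunks: the new `_get_cudnn_batch_norm_reserve_space_size` helper, declared with `TORCH_API` in the `BatchNorm.h` file this patch adds, factors the `cudnnGetBatchNormalizationTrainingExReserveSpaceSize` query out of the forward path so that callers can size the byte-typed reserve buffer the same way the kernel does below. A minimal sketch of one plausible usage, assuming a cuDNN-enabled build and a CUDA input tensor:)

    // Illustrative sketch only; assumes a cuDNN-enabled build. The helper is
    // declared in the BatchNorm.h header added by this patch.
    #include <ATen/ATen.h>
    #include <ATen/native/cudnn/BatchNorm.h>

    at::Tensor make_batch_norm_reserve(const at::Tensor& input, bool training) {
      // Query the reserve size the same way cudnn_batch_norm does internally,
      // then allocate it as a byte tensor on the input's device.
      const size_t reserve_size =
          at::native::_get_cudnn_batch_norm_reserve_space_size(input, training);
      return at::empty({static_cast<int64_t>(reserve_size)},
                       input.options().dtype(at::kByte));
    }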
Constant one(dataType, 1); Constant zero(dataType, 0); @@ -143,10 +187,9 @@ std::tuple cudnn_batch_norm( Tensor reserve; if (training) { - int64_t num_features = input_t.size(1); - save_mean = at::empty({ num_features }, weight_t.options()); - save_var = at::empty({ num_features }, weight_t.options()); + save_mean = at::empty({num_features}, weight_t.options()); + save_var = at::empty({num_features}, weight_t.options()); auto op = CUDNN_BATCHNORM_OPS_BN; size_t workspace_size; @@ -163,14 +206,8 @@ std::tuple cudnn_batch_norm( Tensor workspace = at::empty(workspace_size, input->options().dtype(kByte)); // get the reserved size and allocate as tensor - size_t reserve_size; - AT_CUDNN_CHECK(cudnnGetBatchNormalizationTrainingExReserveSpaceSize( - handle, - mode, - op, - nullptr, - idesc.desc(), - &reserve_size)); + size_t reserve_size = + _get_cudnn_batch_norm_reserve_space_size(input_t, true /* training */); reserve = at::empty(reserve_size, input->options().dtype(kByte)); AT_CUDNN_CHECK(cudnnBatchNormalizationForwardTrainingEx( @@ -180,14 +217,14 @@ std::tuple cudnn_batch_norm( &one, &zero, idesc.desc(), - input->data_ptr(), - nullptr, // z descriptor for BN-Add-Relu - nullptr, // z for BN-Add-ReLU + input->const_data_ptr(), + nullptr, // z descriptor for BN-Add-Relu + nullptr, // z for BN-Add-ReLU idesc.desc(), output->data_ptr(), wdesc.desc(), - weight->data_ptr(), - bias->data_ptr(), + weight->const_data_ptr(), + bias->const_data_ptr(), exponential_average_factor, at::maybe_data_ptr(running_mean), at::maybe_data_ptr(running_var), @@ -205,21 +242,27 @@ std::tuple cudnn_batch_norm( save_mean = at::empty({0}, weight_t.options()); save_var = at::empty({0}, weight_t.options()); AT_CUDNN_CHECK(cudnnBatchNormalizationForwardInference( - handle, mode, &one, &zero, - idesc.desc(), input->data_ptr(), - idesc.desc(), output->data_ptr(), - wdesc.desc(), - weight->data_ptr(), - bias->data_ptr(), - running_mean->data_ptr(), - running_var->data_ptr(), - epsilon)); + handle, + mode, + &one, + &zero, + idesc.desc(), + input->const_data_ptr(), + idesc.desc(), + output->data_ptr(), + wdesc.desc(), + weight->const_data_ptr(), + bias->const_data_ptr(), + running_mean->const_data_ptr(), + running_var->const_data_ptr(), + epsilon)); } // save_mean and save_var can be undefined // If this causes problems, we can initialize them to empty tensors // of the correct type - return std::tuple{output_t, save_mean, save_var, reserve}; + return std::tuple{ + output_t, save_mean, save_var, reserve}; } // NB: CuDNN only implements the backward algorithm for batchnorm @@ -246,13 +289,13 @@ std::tuple cudnn_batch_norm_backward( // TODO: Is it worth it to have a contiguous call or maybe we should go with // whatever format is given here. 
- auto grad_output_contig = grad_output_t.contiguous(input_t.suggest_memory_format()); - TensorArg input{ input_t, "input", 1 }, - grad_output{ grad_output_contig, "grad_output", 2 }, - weight{ weight_t, "weight", 3 }, - save_mean{ save_mean_t, "save_mean", 4 }, - save_var{ save_var_t, "save_var", 5 }, - reserve{ reserveSpace, "reserve_space", 6 }; + auto grad_output_contig = + grad_output_t.contiguous(input_t.suggest_memory_format()); + TensorArg input{input_t, "input", 1}, + grad_output{grad_output_contig, "grad_output", 2}, + weight{weight_t, "weight", 3}, save_mean{save_mean_t, "save_mean", 4}, + save_var{save_var_t, "save_var", 5}, + reserve{reserveSpace, "reserve_space", 6}; CheckedFrom c = "cudnn_batch_norm_backward"; checkAllDefined(c, {input, grad_output, weight, save_mean, save_var}); @@ -277,21 +320,23 @@ std::tuple cudnn_batch_norm_backward( } cudnnBatchNormMode_t mode = getCudnnBatchNormMode( - true, // training - input->suggest_memory_format(), - input->dim() - ); + true, // training + input->suggest_memory_format(), + input->dim()); - auto grad_input_t = at::empty(input->sizes(), input->options(), input->suggest_memory_format()); + auto grad_input_t = at::empty( + input->sizes(), input->options(), input->suggest_memory_format()); auto grad_weight_t = at::empty(weight->sizes(), weight->options()); - auto grad_bias_t = at::empty(weight->sizes(), weight->options()); + auto grad_bias_t = at::empty(weight->sizes(), weight->options()); auto handle = getCudnnHandle(); auto dataType = getCudnnDataType(*input); - TensorDescriptor idesc{ *input, 4 }; // input, grad_output descriptor - TensorDescriptor odesc{ *grad_output, 4 }; // input, grad_output descriptor - TensorDescriptor wdesc{ expandScale(*weight, input->dim()), 4 }; // descriptor for weight, save_mean, etc. + TensorDescriptor idesc{*input, 4}; // input, grad_output descriptor + TensorDescriptor odesc{*grad_output, 4}; // input, grad_output descriptor + TensorDescriptor wdesc{ + expandScale(*weight, input->dim()), + 4}; // descriptor for weight, save_mean, etc. 
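(Editorial aside: the recurring `data_ptr()` to `const_data_ptr()` changes in these hunks follow one convention: tensors that are only read are accessed through `const_data_ptr()`, and `const_cast` appears only at the boundary of C APIs whose signatures lack const qualification, as with `getrsBatched` earlier in this diff. A minimal sketch of the convention, with a hypothetical legacy kernel standing in for the cuBLAS/cuDNN entry points:)

    #include <ATen/ATen.h>
    #include <cstdint>

    // Hypothetical C-style API that takes a mutable pointer even though it only
    // reads from it, as several cuBLAS/cuSOLVER entry points do.
    static void legacy_read_only_kernel(float* data, int64_t n) {
      double acc = 0.0;
      for (int64_t i = 0; i < n; ++i) {
        acc += data[i];
      }
      (void)acc;
    }

    void call_legacy(const at::Tensor& t) {
      // Read-only access on the C++ side stays const-correct...
      const float* p = t.const_data_ptr<float>();
      // ...and const is cast away only at the FFI boundary, mirroring the patch.
      legacy_read_only_kernel(const_cast<float*>(p), t.numel());
    }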
Constant one(dataType, 1); Constant zero(dataType, 0); @@ -314,28 +359,42 @@ std::tuple cudnn_batch_norm_backward( Tensor workspace = at::empty(workspace_size, input->options().dtype(kByte)); AT_CUDNN_CHECK(cudnnBatchNormalizationBackwardEx( - handle, mode, op, &one, &zero, &one, &zero, - idesc.desc(), input->data_ptr(), - nullptr, nullptr, - odesc.desc(), grad_output->data_ptr(), - nullptr, nullptr, - idesc.desc(), grad_input_t.data_ptr(), - wdesc.desc(), weight->data_ptr(), - nullptr, - grad_weight_t.data_ptr(), - grad_bias_t.data_ptr(), - epsilon, - save_mean->data_ptr(), - save_var->data_ptr(), - nullptr, - workspace.data_ptr(), - workspace_size, - reserve->data_ptr(), - reserve->numel())); - - return std::tuple{grad_input_t, grad_weight_t, grad_bias_t}; + handle, + mode, + op, + &one, + &zero, + &one, + &zero, + idesc.desc(), + input->const_data_ptr(), + nullptr, + nullptr, + odesc.desc(), + grad_output->const_data_ptr(), + nullptr, + nullptr, + idesc.desc(), + grad_input_t.data_ptr(), + wdesc.desc(), + weight->const_data_ptr(), + nullptr, + grad_weight_t.data_ptr(), + grad_bias_t.data_ptr(), + epsilon, + save_mean->const_data_ptr(), + save_var->const_data_ptr(), + nullptr, + workspace.data_ptr(), + workspace_size, + reserve->data_ptr(), + reserve->numel())); + + return std::tuple{ + grad_input_t, grad_weight_t, grad_bias_t}; } -}} // namespace native +} // namespace native +} // namespace at #endif diff --git a/aten/src/ATen/native/cudnn/BatchNorm.h b/aten/src/ATen/native/cudnn/BatchNorm.h new file mode 100644 index 0000000000000..3da76c0c16e41 --- /dev/null +++ b/aten/src/ATen/native/cudnn/BatchNorm.h @@ -0,0 +1,6 @@ +namespace at::native { + +TORCH_API size_t +_get_cudnn_batch_norm_reserve_space_size(const Tensor& input_t, bool training); + +} // namespace at::native diff --git a/aten/src/ATen/native/cudnn/ConvPlaceholders.cpp b/aten/src/ATen/native/cudnn/ConvPlaceholders.cpp index f6362b828f4ca..8475a143f466c 100644 --- a/aten/src/ATen/native/cudnn/ConvPlaceholders.cpp +++ b/aten/src/ATen/native/cudnn/ConvPlaceholders.cpp @@ -12,7 +12,8 @@ #include #endif -namespace at { namespace native { +namespace at { +namespace native { // --------------------------------------------------------------------- // @@ -25,89 +26,180 @@ namespace at { namespace native { // See Note [ATen preprocessor philosophy] at::Tensor cudnn_convolution( - const at::Tensor& input, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) { + const at::Tensor& input, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { AT_ERROR("cudnn_convolution: ATen not compiled with cuDNN support"); } at::Tensor& cudnn_convolution_out( - const Tensor& input_t, const Tensor& weight_t, IntArrayRef padding, - IntArrayRef stride, IntArrayRef dilation, int64_t groups, bool benchmark, - bool deterministic, bool allow_tf32, Tensor& output_t) { + const Tensor& input_t, + const Tensor& weight_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32, + Tensor& output_t) { AT_ERROR("cudnn_convolution_out: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_backward_input( - IntArrayRef input_size, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, 
IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - AT_ERROR("cudnn_convolution_backward_input: ATen not compiled with cuDNN support"); + IntArrayRef input_size, + const at::Tensor& grad_output, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + AT_ERROR( + "cudnn_convolution_backward_input: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_backward_weight( - IntArrayRef weight_size, const at::Tensor& grad_output, const at::Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - AT_ERROR("cudnn_convolution_backward_weight: ATen not compiled with cuDNN support"); + IntArrayRef weight_size, + const at::Tensor& grad_output, + const at::Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + AT_ERROR( + "cudnn_convolution_backward_weight: ATen not compiled with cuDNN support"); } -std::tuple cudnn_convolution_backward( - const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32, std::array output_mask) { +std::tuple cudnn_convolution_backward( + const at::Tensor& input, + const at::Tensor& grad_output, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32, + std::array output_mask) { AT_ERROR("cudnn_convolution_backward: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_transpose( - const at::Tensor& input, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) { + const at::Tensor& input, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { AT_ERROR("cudnn_convolution_transpose: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_transpose_backward_input( - const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) { - AT_ERROR("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); + const at::Tensor& grad_output, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + AT_ERROR( + "cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); } at::Tensor cudnn_convolution_transpose_backward_weight( - IntArrayRef weight_size, const at::Tensor& grad_output, const at::Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - AT_ERROR("cudnn_convolution_transpose_backward_weight: ATen not compiled with cuDNN support"); + IntArrayRef weight_size, + const at::Tensor& grad_output, + const at::Tensor& input, + IntArrayRef 
padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + AT_ERROR( + "cudnn_convolution_transpose_backward_weight: ATen not compiled with cuDNN support"); } -std::tuple cudnn_convolution_transpose_backward( - const at::Tensor& input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32, std::array output_mask) { - AT_ERROR("cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); +std::tuple cudnn_convolution_transpose_backward( + const at::Tensor& input, + const at::Tensor& grad_output, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32, + std::array output_mask) { + AT_ERROR( + "cudnn_convolution_transpose_backward: ATen not compiled with cuDNN support"); } void raw_cudnn_convolution_forward_out( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - AT_ERROR("raw_cudnn_convolution_forward_out: ATen not compiled with cuDNN support"); + const Tensor& output, + const Tensor& input, + const Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + AT_ERROR( + "raw_cudnn_convolution_forward_out: ATen not compiled with cuDNN support"); } void raw_cudnn_convolution_backward_input_out( const at::Tensor& grad_input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - AT_ERROR("raw_cudnn_convolution_backward_input_out: ATen not compiled with cuDNN support"); + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + AT_ERROR( + "raw_cudnn_convolution_backward_input_out: ATen not compiled with cuDNN support"); } void raw_cudnn_convolution_backward_weight_out( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - AT_ERROR("raw_cudnn_convolution_backward_weight_out: ATen not compiled with cuDNN support"); + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + AT_ERROR( + "raw_cudnn_convolution_backward_weight_out: ATen not compiled with cuDNN support"); } Tensor cudnn_convolution_relu( @@ -134,6 +226,7 @@ Tensor cudnn_convolution_add_relu( AT_ERROR("cudnn_convolution_add_relu: ATen not compiled with cuDNN support"); } -#endif // AT_CUDNN_ENABLED +#endif // AT_CUDNN_ENABLED -}} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cudnn/ConvShared.cpp b/aten/src/ATen/native/cudnn/ConvShared.cpp index 3a615806e50a9..104ae8c70803d 100644 --- a/aten/src/ATen/native/cudnn/ConvShared.cpp +++ b/aten/src/ATen/native/cudnn/ConvShared.cpp @@ -1,10 +1,10 @@ 
#define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include #include -#include #include #include +#include #include +#include #include #if AT_CUDNN_ENABLED() @@ -30,7 +30,7 @@ // ConvPlaceholders.cpp contains placeholder implementation of cudnn // convolution when cudnn is not enabled. These operators only raises // errors, and do no real computation. These operators are implemented -// using currnet operators. +// using current operators. // // cuDNN v7 and v8 have different API. ConvShared.{cpp, h} contains // code shared by v7 and v8. Conv_v7.cpp contains implementation of @@ -54,7 +54,7 @@ // Function that has different implementation on Conv_v7.cpp // and Conv_v8.cpp // -// The raw API directly invokes CuDNN and are implemeted differently +// The raw API directly invokes CuDNN and are implemented differently // on cuDNN v7 and cuDNN v8 // // There are a few reasons this should never be directly exposed @@ -71,7 +71,8 @@ // - Things that happen in TensorArg // - Check arguments (type, GPU, shape) -namespace at { namespace native { +namespace at { +namespace native { // --------------------------------------------------------------------- // @@ -79,16 +80,17 @@ namespace at { namespace native { // // --------------------------------------------------------------------- -std::ostream& operator<<(std::ostream & out, const ConvolutionParams& params) { +std::ostream& operator<<(std::ostream& out, const ConvolutionParams& params) { out << "ConvolutionParams \n" - << " memory_format = " << params.memory_format << "\n" - << " data_type = " << cudnnTypeToString(params.dataType) << "\n" - << " padding = " << ArrayRef{params.padding} << "\n" - << " stride = " << ArrayRef{params.stride} << "\n" - << " dilation = " << ArrayRef{params.dilation} << "\n" - << " groups = " << params.groups << "\n" - << " deterministic = " << (params.deterministic ? "true" : "false") << "\n" - << " allow_tf32 = " << (params.allow_tf32 ? "true" : "false") << "\n"; + << " memory_format = " << params.memory_format << "\n" + << " data_type = " << cudnnTypeToString(params.dataType) << "\n" + << " padding = " << ArrayRef{params.padding} << "\n" + << " stride = " << ArrayRef{params.stride} << "\n" + << " dilation = " << ArrayRef{params.dilation} << "\n" + << " groups = " << params.groups << "\n" + << " deterministic = " << (params.deterministic ? "true" : "false") + << "\n" + << " allow_tf32 = " << (params.allow_tf32 ? 
"true" : "false") << "\n"; return out; } @@ -100,10 +102,15 @@ std::ostream& operator<<(std::ostream & out, const ConvolutionParams& params) { // grad_input/grad_output, so this is not very pressing) void setConvolutionParams( ConvolutionParams* params, - const at::Tensor& input, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool deterministic, bool allow_tf32, at::MemoryFormat memory_format) { - + const at::Tensor& input, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool deterministic, + bool allow_tf32, + at::MemoryFormat memory_format) { cudnnDataType_t dataType = getCudnnDataType(input); memset(params, 0, sizeof(ConvolutionParams)); params->device_id = at::cuda::current_device(); @@ -112,8 +119,8 @@ void setConvolutionParams( params->input_dim = input.dim(); params->memory_format = memory_format; for (int i = 0; i != params->input_dim; ++i) { - params->input_size[i] = (int) input.sizes()[i]; - params->weight_size[i] = (int) weight.sizes()[i]; + params->input_size[i] = (int)input.sizes()[i]; + params->weight_size[i] = (int)weight.sizes()[i]; } // ASSERT(padding.size() == stride.size()) // ASSERT(padding.size() == dilation.size()) @@ -133,37 +140,55 @@ std::string repro_from_args(const ConvolutionParams& params) { auto pybool = [](bool b) -> const char* { return b ? "True" : "False"; }; std::string partial_dtype; switch (params.dataType) { - case CUDNN_DATA_FLOAT: partial_dtype = "float"; break; - case CUDNN_DATA_DOUBLE: partial_dtype = "double"; break; - case CUDNN_DATA_HALF: partial_dtype = "half"; break; - default: partial_dtype = "unsupported"; + case CUDNN_DATA_FLOAT: + partial_dtype = "float"; + break; + case CUDNN_DATA_DOUBLE: + partial_dtype = "double"; + break; + case CUDNN_DATA_HALF: + partial_dtype = "half"; + break; + default: + partial_dtype = "unsupported"; } const std::string full_dtype = "torch." + partial_dtype; const int out_channels = params.weight_size[0]; const int in_channels = params.weight_size[1] * params.groups; const size_t dim = params.input_dim; - const std::string channels_last_xd = dim == 4 ? "channels_last" : "channels_last_3d"; + const std::string channels_last_xd = + dim == 4 ? "channels_last" : "channels_last_3d"; const std::string to_channels_last = - ((params.memory_format == at::MemoryFormat::ChannelsLast) || (params.memory_format == at::MemoryFormat::ChannelsLast3d)) \ - ? ".to(memory_format=torch." + channels_last_xd + ")" : ""; + ((params.memory_format == at::MemoryFormat::ChannelsLast) || + (params.memory_format == at::MemoryFormat::ChannelsLast3d)) + ? ".to(memory_format=torch." + channels_last_xd + ")" + : ""; std::ostringstream ss; ss << "You can try to repro this exception using the following code snippet. 
"; ss << "If that doesn't trigger the error, please include your original repro script when reporting this issue.\n\n"; ss << "import torch\n"; - ss << "torch.backends.cuda.matmul.allow_tf32 = " << pybool(at::globalContext().allowTF32CuBLAS()) << "\n"; - ss << "torch.backends.cudnn.benchmark = " << pybool(at::globalContext().benchmarkCuDNN()) << "\n"; - ss << "torch.backends.cudnn.deterministic = " << pybool(params.deterministic) << "\n"; - ss << "torch.backends.cudnn.allow_tf32 = " << pybool(params.allow_tf32) << "\n"; - ss << "data = torch.randn(" << ArrayRef(params.input_size, dim) << ", dtype=" << full_dtype << ", "; - ss << "device='cuda', requires_grad=True)" << to_channels_last << "\n"; - ss << "net = torch.nn.Conv" << dim-2 << "d(" << in_channels << ", " << out_channels << ", "; - ss << "kernel_size=" << ArrayRef(¶ms.weight_size[2], dim - 2) << ", "; - ss << "padding=" << ArrayRef(params.padding, dim-2) << ", "; - ss << "stride=" << ArrayRef(params.stride, dim-2) << ", "; - ss << "dilation=" << ArrayRef(params.dilation, dim-2) << ", "; - ss << "groups=" << params.groups << ")\n"; - ss << "net = net.cuda()." << partial_dtype << "()" << to_channels_last << "\n"; + ss << "torch.backends.cuda.matmul.allow_tf32 = " + << pybool(at::globalContext().allowTF32CuBLAS()) << "\n"; + ss << "torch.backends.cudnn.benchmark = " + << pybool(at::globalContext().benchmarkCuDNN()) << "\n"; + ss << "torch.backends.cudnn.deterministic = " << pybool(params.deterministic) + << "\n"; + ss << "torch.backends.cudnn.allow_tf32 = " << pybool(params.allow_tf32) + << "\n"; + ss << "data = torch.randn(" << ArrayRef(params.input_size, dim) + << ", dtype=" << full_dtype << ", "; + ss << "device='cuda', requires_grad=True)" << to_channels_last << "\n"; + ss << "net = torch.nn.Conv" << dim - 2 << "d(" << in_channels << ", " + << out_channels << ", "; + ss << "kernel_size=" << ArrayRef(¶ms.weight_size[2], dim - 2) + << ", "; + ss << "padding=" << ArrayRef(params.padding, dim - 2) << ", "; + ss << "stride=" << ArrayRef(params.stride, dim - 2) << ", "; + ss << "dilation=" << ArrayRef(params.dilation, dim - 2) << ", "; + ss << "groups=" << params.groups << ")\n"; + ss << "net = net.cuda()." 
<< partial_dtype << "()" << to_channels_last + << "\n"; ss << "out = net(data)\n"; ss << "out.backward(torch.randn_like(out))\n"; ss << "torch.cuda.synchronize()\n\n"; @@ -339,10 +364,16 @@ Tensor cudnn_convolution_transpose_backward_input( Tensor cudnn_convolution_backward_input( CheckedFrom c, - IntArrayRef input_size, const TensorArg& grad_output, const TensorArg& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ + IntArrayRef input_size, + const TensorArg& grad_output, + const TensorArg& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { checkAllSameType(c, {grad_output, weight}); checkAllSameGPU(c, {grad_output, weight}); @@ -351,54 +382,114 @@ Tensor cudnn_convolution_backward_input( input_size, grad_output->options().memory_format(memory_format)); // Avoid "grad_input" when this is being used as transposed convolution - TensorArg grad_input{ grad_input_t, "result", 0 }; - convolution_shape_check(c, grad_input, weight, grad_output, padding, stride, dilation, groups); + TensorArg grad_input{grad_input_t, "result", 0}; + convolution_shape_check( + c, grad_input, weight, grad_output, padding, stride, dilation, groups); Tensor weight_contig = weight->contiguous(memory_format); Tensor grad_output_contig = grad_output->contiguous(memory_format); raw_cudnn_convolution_backward_input_out( - *grad_input, grad_output_contig, weight_contig, - padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + *grad_input, + grad_output_contig, + weight_contig, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); return *grad_input; } Tensor cudnn_convolution_transpose_forward( CheckedFrom c, - const TensorArg& grad_output, const TensorArg& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ - auto input_size = conv_input_size(grad_output->sizes(), weight->sizes(), - padding, output_padding, stride, dilation, groups); - return cudnn_convolution_backward_input(c, input_size, grad_output, weight, - padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + const TensorArg& grad_output, + const TensorArg& weight, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + auto input_size = conv_input_size( + grad_output->sizes(), + weight->sizes(), + padding, + output_padding, + stride, + dilation, + groups); + return cudnn_convolution_backward_input( + c, + input_size, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } Tensor cudnn_convolution_backward_input( - IntArrayRef input_size, const Tensor& grad_output_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ - TensorArg grad_output{ grad_output_t, "grad_output", 1 }, - weight{ weight_t, "weight", 2 }; + IntArrayRef input_size, + const Tensor& grad_output_t, + const Tensor& weight_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + TensorArg grad_output{grad_output_t, 
"grad_output", 1}, + weight{weight_t, "weight", 2}; return cudnn_convolution_backward_input( "cudnn_convolution_backward_input", - input_size, grad_output, weight, - padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + input_size, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } Tensor cudnn_convolution_transpose( - const Tensor& input_t, const Tensor& weight_t, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool benchmark, bool deterministic, bool allow_tf32) -{ - TensorArg input { input_t, "input", 1 }, - weight { weight_t, "weight", 2 }; + const Tensor& input_t, + const Tensor& weight_t, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + TensorArg input{input_t, "input", 1}, weight{weight_t, "weight", 2}; CheckedFrom c = "cudnn_convolution_transpose"; auto output_t = cudnn_convolution_transpose_forward( - c, input, weight, padding, output_padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + c, + input, + weight, + padding, + output_padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); return output_t; } @@ -410,31 +501,54 @@ Tensor cudnn_convolution_transpose( Tensor cudnn_convolution_backward_weight( CheckedFrom c, - IntArrayRef weight_size, const Tensor& grad_output_t, const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ + IntArrayRef weight_size, + const Tensor& grad_output_t, + const Tensor& input_t, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { auto layout = cudnn_conv_suggest_memory_format(input_t, grad_output_t); Tensor grad_output_contig_t = grad_output_t.contiguous(layout); - TensorArg grad_output_contig{ grad_output_contig_t, "grad_output", 1 }; + TensorArg grad_output_contig{grad_output_contig_t, "grad_output", 1}; Tensor input_contig_t = input_t.contiguous(layout); - TensorArg input{ input_contig_t, "input", 2}; + TensorArg input{input_contig_t, "input", 2}; checkAllSameType(c, {grad_output_contig, input}); checkAllSameGPU(c, {grad_output_contig, input}); - auto grad_weight_t = at::empty(weight_size, grad_output_contig->options(), layout); + auto grad_weight_t = + at::empty(weight_size, grad_output_contig->options(), layout); // For uniformity with everything else, although it seems grad_weight // would be unambiguous too. 
- TensorArg grad_weight{ grad_weight_t, "result", 0 }; - convolution_shape_check(c, input, grad_weight, grad_output_contig, padding, stride, dilation, groups); + TensorArg grad_weight{grad_weight_t, "result", 0}; + convolution_shape_check( + c, + input, + grad_weight, + grad_output_contig, + padding, + stride, + dilation, + groups); raw_cudnn_convolution_backward_weight_out( - *grad_weight, *grad_output_contig, *input, - padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + *grad_weight, + *grad_output_contig, + *input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); return grad_weight_t; } @@ -443,20 +557,39 @@ Tensor cudnn_convolution_backward_weight( IntArrayRef weight_size, const Tensor& grad_output_t, const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { return cudnn_convolution_backward_weight( "cudnn_convolution_backward_weight", - weight_size, grad_output_t, input_t, - padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + weight_size, + grad_output_t, + input_t, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } -std::tuple cudnn_convolution_backward( - const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32, std::array output_mask) { - +std::tuple cudnn_convolution_backward( + const at::Tensor& input, + const at::Tensor& grad_output_t, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32, + std::array output_mask) { Tensor grad_output = grad_output_t.to(input.suggest_memory_format()); Tensor grad_input, grad_weight; @@ -469,45 +602,104 @@ std::tuple cudnn_convolution_backward( } } else { if (output_mask[0]) { - grad_input = cudnn_convolution_backward_input(input.sizes(), grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + grad_input = cudnn_convolution_backward_input( + input.sizes(), + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } if (output_mask[1]) { - grad_weight = cudnn_convolution_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + grad_weight = cudnn_convolution_backward_weight( + weight.sizes(), + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } } - return std::tuple{grad_input, grad_weight}; + return std::tuple{grad_input, grad_weight}; } Tensor cudnn_convolution_transpose_backward_weight( IntArrayRef weight_size, const Tensor& grad_output_t, const Tensor& input_t, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) -{ + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { return cudnn_convolution_backward_weight( "cudnn_convolution_backward_weight", - weight_size, input_t, grad_output_t, - padding, 
stride, dilation, groups, benchmark, deterministic, allow_tf32); + weight_size, + input_t, + grad_output_t, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } -std::tuple cudnn_convolution_transpose_backward( - const at::Tensor& input, const at::Tensor& grad_output_t, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef output_padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32, std::array output_mask) { - +std::tuple cudnn_convolution_transpose_backward( + const at::Tensor& input, + const at::Tensor& grad_output_t, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef output_padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32, + std::array output_mask) { Tensor grad_output = grad_output_t.contiguous(input.suggest_memory_format()); Tensor grad_input, grad_weight; if (output_mask[0]) { - grad_input = cudnn_convolution_transpose_backward_input(grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + grad_input = cudnn_convolution_transpose_backward_input( + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } if (output_mask[1]) { - grad_weight = cudnn_convolution_transpose_backward_weight(weight.sizes(), grad_output, input, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + grad_weight = cudnn_convolution_transpose_backward_weight( + weight.sizes(), + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } - return std::tuple{grad_input, grad_weight}; + return std::tuple{grad_input, grad_weight}; } Tensor cudnn_convolution_relu( @@ -535,31 +727,14 @@ Tensor cudnn_convolution_relu( bool benchmark = ctx.benchmarkCuDNN(); bool allow_tf32 = ctx.allowTF32CuDNN(); auto _bias = bias_t.has_value() - ? bias_t.value() - : at::zeros( - {output_t.size(1)}, - optTypeMetaToScalarType(output_t.options().dtype_opt()), - output_t.options().layout_opt(), - output_t.options().device_opt(), - output_t.options().pinned_memory_opt()); - -#ifdef AT_CUDNN_CONV_BIAS_RELU_FALLBACK - raw_cudnn_convolution_add_relu_fallback_out( - output_t, - input, - weight, - output_t, // use output_t as z to satisfy CUDNN API - 0, // alpha - _bias, - stride, - padding, - dilation, - groups, - benchmark, // benchmark - false, // deterministic - allow_tf32 // allow_tf32 - ); -#else // AT_CUDNN_CONV_BIAS_RELU_FALLBACK + ? bias_t.value() + : at::zeros( + {output_t.size(1)}, + optTypeMetaToScalarType(output_t.options().dtype_opt()), + output_t.options().layout_opt(), + output_t.options().device_opt(), + output_t.options().pinned_memory_opt()); + raw_cudnn_convolution_add_relu_out( output_t, input, @@ -573,9 +748,8 @@ Tensor cudnn_convolution_relu( groups, benchmark, // benchmark false, // deterministic - allow_tf32 // allow_tf32 + allow_tf32 // allow_tf32 ); -#endif return output_t; } @@ -613,31 +787,14 @@ Tensor cudnn_convolution_add_relu( bool benchmark = ctx.benchmarkCuDNN(); auto _alpha = alpha.has_value() ? alpha.value().to() : 1.0; auto _bias = bias_t.has_value() - ? 
bias_t.value() - : at::zeros( - {output_t.size(1)}, - optTypeMetaToScalarType(output_t.options().dtype_opt()), - output_t.options().layout_opt(), - output_t.options().device_opt(), - output_t.options().pinned_memory_opt()); - -#ifdef AT_CUDNN_CONV_BIAS_RELU_FALLBACK - raw_cudnn_convolution_add_relu_fallback_out( - output_t, - input, - weight, - z, - _alpha, - _bias, - stride, - padding, - dilation, - groups, - benchmark, - false, // deterministic - allow_tf32 // allow_tf32 - ); -#else // AT_CUDNN_CONV_BIAS_RELU_FALLBACK + ? bias_t.value() + : at::zeros( + {output_t.size(1)}, + optTypeMetaToScalarType(output_t.options().dtype_opt()), + output_t.options().layout_opt(), + output_t.options().device_opt(), + output_t.options().pinned_memory_opt()); + raw_cudnn_convolution_add_relu_out( output_t, input, @@ -651,16 +808,20 @@ Tensor cudnn_convolution_add_relu( groups, benchmark, false, // deterministic - allow_tf32 // allow_tf32 + allow_tf32 // allow_tf32 ); -#endif // AT_CUDNN_CONV_BIAS_RELU_FALLBACK return output_t; } -REGISTER_CUDA_DISPATCH(cudnn_convolution_backward_stub, &cudnn_convolution_backward); -REGISTER_CUDA_DISPATCH(cudnn_convolution_transpose_backward_stub, &cudnn_convolution_transpose_backward); +REGISTER_CUDA_DISPATCH( + cudnn_convolution_backward_stub, + &cudnn_convolution_backward); +REGISTER_CUDA_DISPATCH( + cudnn_convolution_transpose_backward_stub, + &cudnn_convolution_transpose_backward); -}} +} // namespace native +} // namespace at -#endif // AT_CUDNN_ENABLED +#endif // AT_CUDNN_ENABLED diff --git a/aten/src/ATen/native/cudnn/ConvShared.h b/aten/src/ATen/native/cudnn/ConvShared.h index 89986adadac1f..ae68bfc7d20d6 100644 --- a/aten/src/ATen/native/cudnn/ConvShared.h +++ b/aten/src/ATen/native/cudnn/ConvShared.h @@ -1,16 +1,13 @@ #pragma once #include -#include #include #include +#include #include -#if CUDNN_VERSION < 8000 -#define AT_CUDNN_CONV_BIAS_RELU_FALLBACK -#endif - -namespace at { namespace native { +namespace at { +namespace native { // --------------------------------------------------------------------- // @@ -20,8 +17,7 @@ namespace at { namespace native { // This POD struct is used to let us easily compute hashes of the // parameters -struct ConvolutionParams -{ +struct ConvolutionParams { c10::DeviceIndex device_id; cudnnDataType_t dataType; int input_size[2 + max_dim]; @@ -38,7 +34,7 @@ struct ConvolutionParams // forward and backward, so you can reuse the benchmark entry, }; -std::ostream& operator<<(std::ostream & out, const ConvolutionParams& params); +std::ostream& operator<<(std::ostream& out, const ConvolutionParams& params); // NB: This can't be a constructor, because then ConvolutionParams // would not be a POD anymore. 
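(Editorial aside on the preserved `// NB: This can't be a constructor` note just above: `ConvolutionParams` is kept a POD so that `setConvolutionParams` can `memset` it to zero and the benchmark cache can hash and compare keys byte-wise via `ParamsHash`/`ParamsEqual` (see `BenchmarkCache` later in this diff). Zeroing first matters because padding bytes are otherwise indeterminate and would make equal keys hash differently. A minimal sketch of that idea, using a hypothetical key type in place of `ConvolutionParams`:)

    #include <cstddef>
    #include <cstdint>
    #include <cstring>

    struct KeyPod {           // hypothetical stand-in for ConvolutionParams
      int64_t input_size[4];
      int32_t groups;         // the compiler adds tail padding after this field
    };

    // Byte-wise FNV-1a hash over the whole struct, in the spirit of ParamsHash.
    inline size_t bytewise_hash(const KeyPod& k) {
      const auto* p = reinterpret_cast<const unsigned char*>(&k);
      size_t h = 0xcbf29ce484222325ull;
      for (size_t i = 0; i < sizeof(KeyPod); ++i) {
        h = (h ^ p[i]) * 0x100000001b3ull;
      }
      return h;
    }

    inline KeyPod make_key(int64_t n, int64_t c, int64_t h, int64_t w, int32_t groups) {
      KeyPod k;
      // Zero the whole struct (padding included) so equal keys hash and compare equal.
      std::memset(&k, 0, sizeof(KeyPod));
      k.input_size[0] = n; k.input_size[1] = c;
      k.input_size[2] = h; k.input_size[3] = w;
      k.groups = groups;
      return k;
    }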
@@ -47,13 +43,18 @@ std::ostream& operator<<(std::ostream & out, const ConvolutionParams& params); // grad_input/grad_output, so this is not very pressing) void setConvolutionParams( ConvolutionParams* params, - const at::Tensor& input, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups, bool deterministic, bool allow_tf32, at::MemoryFormat memory_format); + const at::Tensor& input, + const at::Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool deterministic, + bool allow_tf32, + at::MemoryFormat memory_format); std::string repro_from_args(const ConvolutionParams& args); - // --------------------------------------------------------------------- // // Raw functions @@ -61,21 +62,40 @@ std::string repro_from_args(const ConvolutionParams& args); // --------------------------------------------------------------------- void raw_cudnn_convolution_forward_out( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32); + const Tensor& output, + const Tensor& input, + const Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32); void raw_cudnn_convolution_backward_input_out( const at::Tensor& grad_input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32); + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32); void raw_cudnn_convolution_backward_weight_out( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32); + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32); void raw_cudnn_convolution_add_relu_out( const Tensor& output, @@ -107,7 +127,6 @@ void raw_cudnn_convolution_add_relu_fallback_out( bool deterministic, bool allow_tf32); - #if AT_CUDNN_ENABLED() // v7 functions are preserved here to allow for runtime switching to v7 @@ -116,21 +135,40 @@ void raw_cudnn_convolution_add_relu_fallback_out( // versions, as v7 explicitly splits large tensors as a 32-bit indexing // workaround whereas v8 expects cuDNN to handle large tensors. 
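(Editorial aside on the comment above about keeping the v7 raw functions: the 32-bit indexing workaround it refers to amounts to splitting the batch dimension so that each chunk's element count fits a 32-bit index. The helper below is a hypothetical distillation, not the code in Conv_v7.cpp, shown only to make the constraint concrete:)

    #include <ATen/ATen.h>
    #include <algorithm>
    #include <limits>
    #include <vector>

    // Split along the batch dimension so every chunk has at most INT_MAX
    // elements, i.e. remains addressable with 32-bit indexing.
    std::vector<at::Tensor> split_for_32bit_indexing(const at::Tensor& t) {
      constexpr int64_t int_max = std::numeric_limits<int>::max();
      const int64_t batch = std::max<int64_t>(t.size(0), 1);
      const int64_t per_sample = std::max<int64_t>(t.numel() / batch, 1);
      const int64_t max_batch = std::max<int64_t>(int_max / per_sample, 1);
      return t.split(max_batch, /*dim=*/0);
    }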
void raw_cudnn_convolution_forward_out_v7( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32); + const Tensor& output, + const Tensor& input, + const Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32); void raw_cudnn_convolution_backward_input_out_v7( const at::Tensor& grad_input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32); + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32); void raw_cudnn_convolution_backward_weight_out_v7( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32); + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32); void raw_cudnn_convolution_add_relu_out_v7( const Tensor& output, @@ -147,4 +185,5 @@ void raw_cudnn_convolution_add_relu_out_v7( bool deterministic, bool allow_tf32); #endif -}} +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cudnn/Conv_v7.cpp b/aten/src/ATen/native/cudnn/Conv_v7.cpp index ef3a70a2232f4..1c5b6fb94a221 100644 --- a/aten/src/ATen/native/cudnn/Conv_v7.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v7.cpp @@ -1,5 +1,5 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include // for the definition of AT_CUDNN_ENABLED +#include // for the definition of AT_CUDNN_ENABLED #if AT_CUDNN_ENABLED() @@ -13,12 +13,12 @@ #include #endif -#include -#include #include -#include #include #include +#include +#include +#include #include #include @@ -27,43 +27,48 @@ #include #include +#include +#include #include #include -#include -#include #include #include -#include +#include #include // Note [behavior of cudnnFind and cudnnGet] -// You'll notice that by default, in the ConvolutionDescriptor, we do the following: +// You'll notice that by default, in the ConvolutionDescriptor, we do the +// following: // -// AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_DEFAULT_MATH)); -// if(dataType == CUDNN_DATA_HALF) -// AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), CUDNN_TENSOR_OP_MATH)); +// AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), +// CUDNN_DEFAULT_MATH)); if(dataType == CUDNN_DATA_HALF) +// AT_CUDNN_CHECK(cudnnSetConvolutionMathType(mut_desc(), +// CUDNN_TENSOR_OP_MATH)); // // Update: AT_CUDNN_CHECK is updated with AT_CUDNN_CHECK_WITH_SHAPES, which -// automatically prints tensor shapes and convolution parameters if there is -// a cuDNN exception thrown. +// automatically prints tensor shapes and convolution parameters if there +// is a cuDNN exception thrown. // -// When cudnnSetConvolutionMathType is called before cudnnGet/cudnnFind, it informs -// cudnnGet/cudnnFind to iterate/take into account both tensor core and non-tensor-core algos. -// If you don't call cudnnSetConvolutionMathType before calling cudnnGet/cudnnFind, -// cudnnGet/cudnnFind may not pick tensor core algos. 
+// When cudnnSetConvolutionMathType is called before cudnnGet/cudnnFind, it +// informs cudnnGet/cudnnFind to iterate/take into account both tensor core and +// non-tensor-core algos. If you don't call cudnnSetConvolutionMathType before +// calling cudnnGet/cudnnFind, cudnnGet/cudnnFind may not pick tensor core +// algos. // -// Now after its run, cudnnGet/cudnnFind comes up with the best pair of algo+mathType -// with all the initial knowledge its given. It then becomes the user's responsibility -// to update mathType of the convolution descriptor and call the subsequent cudnn calls with -// the best algo and the updated descriptor. If we don't update the descriptor but just run -// with the best algo, under the hood, cudnn will run with the slower kernel -// since it sees fastest algorithm combination with a sub optimal mathType. - -constexpr size_t operator "" _TiB(unsigned long long n) { +// Now after its run, cudnnGet/cudnnFind comes up with the best pair of +// algo+mathType with all the initial knowledge its given. It then becomes the +// user's responsibility to update mathType of the convolution descriptor and +// call the subsequent cudnn calls with the best algo and the updated +// descriptor. If we don't update the descriptor but just run with the best +// algo, under the hood, cudnn will run with the slower kernel since it sees +// fastest algorithm combination with a sub optimal mathType. + +constexpr size_t operator"" _TiB(unsigned long long n) { return size_t(n) * 1024 * 1024 * 1024 * 1024; } -namespace at { namespace native { +namespace at { +namespace native { // Convenience struct for passing around descriptors and data // pointers @@ -72,23 +77,27 @@ struct ConvolutionArgs { ConvolutionParams params; TensorDescriptor idesc, odesc; FilterDescriptor wdesc; - const Tensor& input, output, weight; + const Tensor &input, output, weight; ConvolutionDescriptor cdesc; - ConvolutionArgs(const Tensor& input, const Tensor& output, const Tensor& weight) : input(input), output(output), weight(weight) { - } + ConvolutionArgs( + const Tensor& input, + const Tensor& output, + const Tensor& weight) + : input(input), output(output), weight(weight) {} }; -std::ostream& operator<<(std::ostream & out, const ConvolutionArgs& args) { - out << repro_from_args(args.params) // already has a trailing newline - << args.params // already has a trailing newline - << "input: " << args.idesc // already has a trailing newline - << "output: " << args.odesc // already has a trailing newline - << "weight: " << args.wdesc // already has a trailing newline - << "Pointer addresses: " << "\n" - << " input: " << args.input.data_ptr() << "\n" - << " output: " << args.output.data_ptr() << "\n" - << " weight: " << args.weight.data_ptr() << "\n"; +std::ostream& operator<<(std::ostream& out, const ConvolutionArgs& args) { + out << repro_from_args(args.params) // already has a trailing newline + << args.params // already has a trailing newline + << "input: " << args.idesc // already has a trailing newline + << "output: " << args.odesc // already has a trailing newline + << "weight: " << args.wdesc // already has a trailing newline + << "Pointer addresses: " + << "\n" + << " input: " << args.input.const_data_ptr() << "\n" + << " output: " << args.output.const_data_ptr() << "\n" + << " weight: " << args.weight.const_data_ptr() << "\n"; return out; } @@ -103,7 +112,12 @@ std::ostream& operator<<(std::ostream & out, const ConvolutionArgs& args) { template struct BenchmarkCache { std::mutex mutex; - std::unordered_map, 
ParamsEqual> map; + std::unordered_map< + ConvolutionParams, + T, + ParamsHash, + ParamsEqual> + map; bool find(const ConvolutionParams& params, T* results) { std::lock_guard guard(mutex); @@ -129,10 +143,11 @@ BenchmarkCache bwd_filter_algos; // tensor instead. struct Workspace { Workspace(size_t size) : size(size), data(NULL) { - // Sometimes cuDNN returns a workspace size > 2^63, this could makes the allocation of - // workspace fail with some 64bit indexing error instead of an OOM error. In such case, - // we manually fail with OOM. - TORCH_CHECK_WITH(OutOfMemoryError, size < 1_TiB, "Not enough memory for workspace!"); + // Sometimes cuDNN returns a workspace size > 2^63, this could makes the + // allocation of workspace fail with some 64bit indexing error instead of an + // OOM error. In such case, we manually fail with OOM. + TORCH_CHECK_WITH( + OutOfMemoryError, size < 1_TiB, "Not enough memory for workspace!"); data = c10::cuda::CUDACachingAllocator::raw_alloc(size); } Workspace(const Workspace&) = delete; @@ -148,78 +163,80 @@ struct Workspace { void* data; }; -template -struct algorithm_search { -}; +template +struct algorithm_search {}; cudnnStatus_t getWorkspaceSize( const ConvolutionArgs& args, - cudnnConvolutionFwdAlgo_t algo, size_t* sz) -{ - return cudnnGetConvolutionForwardWorkspaceSize( - args.handle, - args.idesc.desc(), - args.wdesc.desc(), - args.cdesc.desc(), - args.odesc.desc(), - algo, - sz - ); + cudnnConvolutionFwdAlgo_t algo, + size_t* sz) { + return cudnnGetConvolutionForwardWorkspaceSize( + args.handle, + args.idesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + algo, + sz); } cudnnStatus_t getWorkspaceSize( const ConvolutionArgs& args, - cudnnConvolutionBwdDataAlgo_t algo, size_t* sz) -{ - return cudnnGetConvolutionBackwardDataWorkspaceSize( - args.handle, - args.wdesc.desc(), - args.odesc.desc(), - args.cdesc.desc(), - args.idesc.desc(), - algo, - sz); + cudnnConvolutionBwdDataAlgo_t algo, + size_t* sz) { + return cudnnGetConvolutionBackwardDataWorkspaceSize( + args.handle, + args.wdesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.idesc.desc(), + algo, + sz); } cudnnStatus_t getWorkspaceSize( const ConvolutionArgs& args, - cudnnConvolutionBwdFilterAlgo_t algo, size_t* sz) -{ - return cudnnGetConvolutionBackwardFilterWorkspaceSize( - args.handle, - args.idesc.desc(), - args.odesc.desc(), - args.cdesc.desc(), - args.wdesc.desc(), - algo, - sz); + cudnnConvolutionBwdFilterAlgo_t algo, + size_t* sz) { + return cudnnGetConvolutionBackwardFilterWorkspaceSize( + args.handle, + args.idesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.wdesc.desc(), + algo, + sz); } -template +template size_t getMaxWorkspaceSize( const ConvolutionArgs& args, - const algo_t *algo, int n_algo) -{ + const algo_t* algo, + int n_algo) { size_t max_ws_size = 0; size_t max_block_size = 0; const auto device = c10::cuda::current_device(); // For the native allocator, retrieves the size of the largest unused block. - // For cudaMallocAsync, see c10/cuda/CUDAMallocAsync.cpp:cacheInfo for details. + // For cudaMallocAsync, see c10/cuda/CUDAMallocAsync.cpp:cacheInfo for + // details. 
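The 1_TiB guard in the Workspace constructor above leans on the user-defined literal declared earlier in this file; a self-contained restatement of how it composes (the static_assert and comments are illustrative, not part of the source):

#include <cstddef>
constexpr size_t operator"" _TiB(unsigned long long n) {
  return size_t(n) * 1024 * 1024 * 1024 * 1024; // n * 2^40 bytes
}
static_assert(1_TiB == (size_t{1} << 40), "1 TiB is 2^40 bytes");
// Workspace sizes reported by cuDNN at or above this bound are treated as bogus,
//   TORCH_CHECK_WITH(OutOfMemoryError, size < 1_TiB, "Not enough memory for workspace!");
// so the failure surfaces as an ordinary OOM instead of a 64-bit indexing error.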
c10::cuda::CUDACachingAllocator::cacheInfo(device, &max_block_size); for (const auto i : c10::irange(n_algo)) { cudnnStatus_t err; size_t sz; err = getWorkspaceSize(args, algo[i], &sz); - if (CUDNN_STATUS_SUCCESS != err || sz == 0 || sz < max_ws_size || sz > max_block_size) + if (CUDNN_STATUS_SUCCESS != err || sz == 0 || sz < max_ws_size || + sz > max_block_size) continue; max_ws_size = sz; } return max_ws_size; } -template -std::vector getValidAlgorithms(perf_t *perfResults, const ConvolutionArgs& args, int n_algo) { - +template +std::vector getValidAlgorithms( + perf_t* perfResults, + const ConvolutionArgs& args, + int n_algo) { std::vector result; result.reserve(n_algo); for (const auto i : c10::irange(n_algo)) { @@ -228,170 +245,203 @@ std::vector getValidAlgorithms(perf_t *perfResults, const ConvolutionArg // TODO: Shouldn't all returned results be successful? // Double check documentation for cudnnFindConvolutionForwardAlgorithmEx if (perf.status == CUDNN_STATUS_SUCCESS) { - if (!args.params.deterministic || perf.determinism == CUDNN_DETERMINISTIC) { - + if (!args.params.deterministic || + perf.determinism == CUDNN_DETERMINISTIC) { result.push_back(perf); } } } - TORCH_CHECK(result.size() > 0, "no valid convolution algorithms available in CuDNN"); + TORCH_CHECK( + result.size() > 0, "no valid convolution algorithms available in CuDNN"); return result; } -template<> +template <> struct algorithm_search { using perf_t = cudnnConvolutionFwdAlgoPerf_t; using algo_t = cudnnConvolutionFwdAlgo_t; - static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - static BenchmarkCache& cache() { return fwd_algos; } + static constexpr auto DEFAULT_ALGO = + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + static BenchmarkCache& cache() { + return fwd_algos; + } - static std::vector findAlgorithms(const ConvolutionArgs& args, bool benchmark) { + static std::vector findAlgorithms( + const ConvolutionArgs& args, + bool benchmark) { static const algo_t algos[] = { - CUDNN_CONVOLUTION_FWD_ALGO_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_FFT, - CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, - CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, - CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, + CUDNN_CONVOLUTION_FWD_ALGO_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_FFT, + CUDNN_CONVOLUTION_FWD_ALGO_FFT_TILING, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, + CUDNN_CONVOLUTION_FWD_ALGO_DIRECT, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD, + CUDNN_CONVOLUTION_FWD_ALGO_WINOGRAD_NONFUSED, }; static constexpr int num_algos = CUDNN_CONVOLUTION_FWD_ALGO_COUNT; - static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, - "Missing cuDNN convolution forward algorithms"); + static_assert( + sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing cuDNN convolution forward algorithms"); int perf_count; std::unique_ptr perf_results(new perf_t[num_algos]); if (!benchmark) { - AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionForwardAlgorithm_v7( - args.handle, - args.idesc.desc(), - args.wdesc.desc(), - args.cdesc.desc(), - args.odesc.desc(), - num_algos, - &perf_count, - perf_results.get()), args); + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnGetConvolutionForwardAlgorithm_v7( + args.handle, + args.idesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + num_algos, + &perf_count, + perf_results.get()), + args); } else { size_t max_ws_size = 
getMaxWorkspaceSize(args, algos, num_algos); Workspace ws(max_ws_size); at::cuda::errorIfCapturingCudnnBenchmark("cudnnFind"); - AT_CUDNN_CHECK_WITH_SHAPES(cudnnFindConvolutionForwardAlgorithmEx( - args.handle, - args.idesc.desc(), args.input.data_ptr(), - args.wdesc.desc(), args.weight.data_ptr(), - args.cdesc.desc(), - args.odesc.desc(), args.output.data_ptr(), - num_algos, - &perf_count, - perf_results.get(), - ws.data, - ws.size), args); + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnFindConvolutionForwardAlgorithmEx( + args.handle, + args.idesc.desc(), + args.input.const_data_ptr(), + args.wdesc.desc(), + args.weight.const_data_ptr(), + args.cdesc.desc(), + args.odesc.desc(), + args.output.data_ptr(), + num_algos, + &perf_count, + perf_results.get(), + ws.data, + ws.size), + args); // Free the cached blocks in our caching allocator. They are - // needed here because the above benchmarking uses a huge amount of memory, - // e.g. a few GBs. + // needed here because the above benchmarking uses a huge amount of + // memory, e.g. a few GBs. c10::cuda::CUDACachingAllocator::emptyCache(); } return getValidAlgorithms(perf_results.get(), args, perf_count); } static void getWorkspaceSize( - const ConvolutionArgs& args, - algo_t algo, size_t* workspaceSize) - { - AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionForwardWorkspaceSize( - args.handle, - args.idesc.desc(), - args.wdesc.desc(), - args.cdesc.desc(), - args.odesc.desc(), - algo, - workspaceSize), args); + const ConvolutionArgs& args, + algo_t algo, + size_t* workspaceSize) { + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnGetConvolutionForwardWorkspaceSize( + args.handle, + args.idesc.desc(), + args.wdesc.desc(), + args.cdesc.desc(), + args.odesc.desc(), + algo, + workspaceSize), + args); } }; -template<> +template <> struct algorithm_search { using perf_t = cudnnConvolutionBwdDataAlgoPerf_t; using algo_t = cudnnConvolutionBwdDataAlgo_t; static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1; - static BenchmarkCache& cache() { return bwd_data_algos; } + static BenchmarkCache& cache() { + return bwd_data_algos; + } - static std::vector findAlgorithms(const ConvolutionArgs& args, bool benchmark) { + static std::vector findAlgorithms( + const ConvolutionArgs& args, + bool benchmark) { static const algo_t algos[] = { CUDNN_CONVOLUTION_BWD_DATA_ALGO_0, CUDNN_CONVOLUTION_BWD_DATA_ALGO_1, CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT, CUDNN_CONVOLUTION_BWD_DATA_ALGO_FFT_TILING, CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD, - CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED - }; + CUDNN_CONVOLUTION_BWD_DATA_ALGO_WINOGRAD_NONFUSED}; static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_DATA_ALGO_COUNT; - static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, - "Missing cuDNN convolution backward data algorithms."); + static_assert( + sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing cuDNN convolution backward data algorithms."); int perf_count; std::unique_ptr perf_results(new perf_t[num_algos]); if (!benchmark) { - AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionBackwardDataAlgorithm_v7( - args.handle, - args.wdesc.desc(), - args.odesc.desc(), - args.cdesc.desc(), - args.idesc.desc(), - num_algos, - &perf_count, - perf_results.get()), args); + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnGetConvolutionBackwardDataAlgorithm_v7( + args.handle, + args.wdesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.idesc.desc(), + num_algos, + &perf_count, + perf_results.get()), + args); } else { size_t max_ws_size = getMaxWorkspaceSize(args, algos, num_algos); 
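Both the heuristic (cudnnGet*_v7) and benchmarking (cudnnFind*Ex) paths above are steered by the same three caller-supplied flags. How those flags are typically set at the ATen level, as a usage sketch (assuming the standard at::Context setters; the Python-side equivalents are torch.backends.cudnn.benchmark, .deterministic and .allow_tf32):

#include <ATen/Context.h>
void configure_cudnn_algo_search() {
  at::globalContext().setBenchmarkCuDNN(true);      // benchmark: time real kernels via cudnnFind*AlgorithmEx
  at::globalContext().setDeterministicCuDNN(false); // deterministic would filter to deterministic algos only
  at::globalContext().setAllowTF32CuDNN(true);      // allow_tf32 gates TF32 math for float32 convolutions
}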
Workspace ws(max_ws_size); at::cuda::errorIfCapturingCudnnBenchmark("cudnnFind"); - AT_CUDNN_CHECK_WITH_SHAPES(cudnnFindConvolutionBackwardDataAlgorithmEx( - args.handle, - args.wdesc.desc(), args.weight.data_ptr(), - args.odesc.desc(), args.output.data_ptr(), - args.cdesc.desc(), - args.idesc.desc(), args.input.data_ptr(), - num_algos, - &perf_count, - perf_results.get(), - ws.data, - ws.size), args); + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnFindConvolutionBackwardDataAlgorithmEx( + args.handle, + args.wdesc.desc(), + args.weight.const_data_ptr(), + args.odesc.desc(), + args.output.const_data_ptr(), + args.cdesc.desc(), + args.idesc.desc(), + args.input.data_ptr(), + num_algos, + &perf_count, + perf_results.get(), + ws.data, + ws.size), + args); // Free the cached blocks in our caching allocator. They are - // needed here because the above benchmarking uses a huge amount of memory, - // e.g. a few GBs. + // needed here because the above benchmarking uses a huge amount of + // memory, e.g. a few GBs. c10::cuda::CUDACachingAllocator::emptyCache(); } return getValidAlgorithms(perf_results.get(), args, perf_count); } static void getWorkspaceSize( - const ConvolutionArgs& args, - cudnnConvolutionBwdDataAlgo_t algo, size_t* workspaceSize) - { - AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionBackwardDataWorkspaceSize( - args.handle, - args.wdesc.desc(), - args.odesc.desc(), - args.cdesc.desc(), - args.idesc.desc(), - algo, - workspaceSize), args); + const ConvolutionArgs& args, + cudnnConvolutionBwdDataAlgo_t algo, + size_t* workspaceSize) { + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnGetConvolutionBackwardDataWorkspaceSize( + args.handle, + args.wdesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.idesc.desc(), + algo, + workspaceSize), + args); } }; -template<> +template <> struct algorithm_search { using perf_t = cudnnConvolutionBwdFilterAlgoPerf_t; using algo_t = cudnnConvolutionBwdFilterAlgo_t; static constexpr auto DEFAULT_ALGO = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1; - static BenchmarkCache& cache() { return bwd_filter_algos; } + static BenchmarkCache& cache() { + return bwd_filter_algos; + } - static std::vector findAlgorithms(const ConvolutionArgs& args, bool benchmark) { + static std::vector findAlgorithms( + const ConvolutionArgs& args, + bool benchmark) { static const algo_t algos[] = { CUDNN_CONVOLUTION_BWD_FILTER_ALGO_0, CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1, @@ -401,68 +451,82 @@ struct algorithm_search { CUDNN_CONVOLUTION_BWD_FILTER_ALGO_FFT_TILING, }; // NOTE: - 1 because ALGO_WINOGRAD is not implemented - static constexpr int num_algos = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT - 1; - static_assert(sizeof(algos) / sizeof(algos[0]) == num_algos, - "Missing cuDNN convolution backward filter algorithms."); + static constexpr int num_algos = + CUDNN_CONVOLUTION_BWD_FILTER_ALGO_COUNT - 1; + static_assert( + sizeof(algos) / sizeof(algos[0]) == num_algos, + "Missing cuDNN convolution backward filter algorithms."); std::unique_ptr perf_results(new perf_t[num_algos]); int perf_count; if (!benchmark) { - AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionBackwardFilterAlgorithm_v7( - args.handle, - args.idesc.desc(), - args.odesc.desc(), - args.cdesc.desc(), - args.wdesc.desc(), - num_algos, - &perf_count, - perf_results.get()), args); + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnGetConvolutionBackwardFilterAlgorithm_v7( + args.handle, + args.idesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.wdesc.desc(), + num_algos, + &perf_count, + perf_results.get()), + args); } else { size_t max_ws_size = 
getMaxWorkspaceSize(args, algos, num_algos); Workspace ws(max_ws_size); at::cuda::errorIfCapturingCudnnBenchmark("cudnnFind"); - AT_CUDNN_CHECK_WITH_SHAPES(cudnnFindConvolutionBackwardFilterAlgorithmEx( - args.handle, - args.idesc.desc(), args.input.data_ptr(), - args.odesc.desc(), args.output.data_ptr(), - args.cdesc.desc(), - args.wdesc.desc(), args.weight.data_ptr(), - num_algos, - &perf_count, - perf_results.get(), - ws.data, - ws.size), args); + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnFindConvolutionBackwardFilterAlgorithmEx( + args.handle, + args.idesc.desc(), + args.input.const_data_ptr(), + args.odesc.desc(), + args.output.const_data_ptr(), + args.cdesc.desc(), + args.wdesc.desc(), + args.weight.data_ptr(), + num_algos, + &perf_count, + perf_results.get(), + ws.data, + ws.size), + args); // Free the cached blocks in our caching allocator. They are - // needed here because the above benchmarking uses a huge amount of memory, - // e.g. a few GBs. + // needed here because the above benchmarking uses a huge amount of + // memory, e.g. a few GBs. c10::cuda::CUDACachingAllocator::emptyCache(); } return getValidAlgorithms(perf_results.get(), args, perf_count); } - static void getWorkspaceSize(const ConvolutionArgs& args, algo_t algo, size_t* workspaceSize) - { - AT_CUDNN_CHECK_WITH_SHAPES(cudnnGetConvolutionBackwardFilterWorkspaceSize( - args.handle, - args.idesc.desc(), - args.odesc.desc(), - args.cdesc.desc(), - args.wdesc.desc(), - algo, - workspaceSize), args); + static void getWorkspaceSize( + const ConvolutionArgs& args, + algo_t algo, + size_t* workspaceSize) { + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnGetConvolutionBackwardFilterWorkspaceSize( + args.handle, + args.idesc.desc(), + args.odesc.desc(), + args.cdesc.desc(), + args.wdesc.desc(), + algo, + workspaceSize), + args); } }; -template +template class AlgoIterator { using search = algorithm_search; - const ConvolutionArgs &args; + const ConvolutionArgs& args; bool benchmark; -public: - AlgoIterator(const ConvolutionArgs &args, bool benchmark): args(args), benchmark(benchmark) {} + public: + AlgoIterator(const ConvolutionArgs& args, bool benchmark) + : args(args), benchmark(benchmark) {} - static std::vector onlyDefaultAlgorithm(const ConvolutionArgs &args) { + static std::vector onlyDefaultAlgorithm(const ConvolutionArgs& args) { std::vector perfResults(1); perfResults[0].algo = search::DEFAULT_ALGO; if (args.params.dataType == CUDNN_DATA_HALF) { @@ -473,11 +537,12 @@ class AlgoIterator { perfResults[0].mathType = CUDNN_FMA_MATH; } } - search::getWorkspaceSize(args, perfResults[0].algo, &(perfResults[0].memory)); + search::getWorkspaceSize( + args, perfResults[0].algo, &(perfResults[0].memory)); return perfResults; } - void try_all(std::function f) { + void try_all(std::function f) { bool only_use_default = args.params.deterministic && !benchmark; auto& cache = search::cache(); @@ -486,32 +551,36 @@ class AlgoIterator { try { f(algoPerf); return; - } catch (c10::OutOfMemoryError &e) { + } catch (c10::OutOfMemoryError& e) { cudaGetLastError(); // clear CUDA error } } - auto perfResults = only_use_default ? onlyDefaultAlgorithm(args) : search::findAlgorithms(args, benchmark); - for (auto &algoPerf : perfResults) { + auto perfResults = only_use_default + ? 
onlyDefaultAlgorithm(args) + : search::findAlgorithms(args, benchmark); + for (auto& algoPerf : perfResults) { try { f(algoPerf); cache.insert(args.params, algoPerf); return; - } catch (c10::OutOfMemoryError &e) { + } catch (c10::OutOfMemoryError& e) { cudaGetLastError(); // clear CUDA error - } catch (c10::CuDNNError &e) { + } catch (c10::CuDNNError& e) { cudaGetLastError(); // clear CUDA error } } - TORCH_CHECK(false, "Unable to find a valid cuDNN algorithm to run convolution"); + TORCH_CHECK( + false, "Unable to find a valid cuDNN algorithm to run convolution"); } }; -inline Tensor allocate_workspace(size_t size, const Tensor &other) { - // Sometimes cuDNN returns a workspace size > 2^63, this could makes the allocation of - // workspace fail with some 64bit indexing error instead of an OOM error. In such case, - // we manually fail with OOM. - TORCH_CHECK_WITH(OutOfMemoryError, size < 1_TiB, "Not enough memory for workspace!"); +inline Tensor allocate_workspace(size_t size, const Tensor& other) { + // Sometimes cuDNN returns a workspace size > 2^63, this could makes the + // allocation of workspace fail with some 64bit indexing error instead of an + // OOM error. In such case, we manually fail with OOM. + TORCH_CHECK_WITH( + OutOfMemoryError, size < 1_TiB, "Not enough memory for workspace!"); return at::empty({static_cast(size)}, other.options().dtype(kByte)); } @@ -519,14 +588,14 @@ inline Tensor allocate_workspace(size_t size, const Tensor &other) { // // - raw_cudnn_convolution_forward_out (Tensor) // Functiont that handles tensors that are too large to use 32bit indexing. -// It just split the tensor and dispatches to `raw_cudnn_convolution_forward_out_32bit`. +// It just split the tensor and dispatches to +// `raw_cudnn_convolution_forward_out_32bit`. // // - raw_cudnn_convolution_forward_out_32bit (Tensor) // Low level function which invokes CuDNN, and takes an output // tensor which is directly written to (thus _out). // - // --------------------------------------------------------------------- // // Splitting to 32bit @@ -538,19 +607,36 @@ static inline void split_batch_dim_to_32bit_out( const at::Tensor& output, const at::Tensor& input, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32, - int64_t max_worksize, func_t func_32bit) { + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32, + int64_t max_worksize, + func_t func_32bit) { constexpr int64_t int_max = std::numeric_limits::max(); const int64_t ni = input.numel(); const int64_t no = output.numel(); // Assume the shape of the tensor is (N, C, D1, D2, ...) // if N * C * D1 * D2 * ... <= int_max, then no need to split at all if (ni <= int_max && no <= int_max) { - func_32bit(output, input, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + func_32bit( + output, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); return; } - // else, if C * D1 * D2 * ... <= int_max, then we just need to split across the N dimension + // else, if C * D1 * D2 * ... 
<= int_max, then we just need to split across + // the N dimension // // Here we use a simple heuristics to determine the size of each split // We don't max out the 2^31 address space because this number is super @@ -565,30 +651,42 @@ static inline void split_batch_dim_to_32bit_out( int64_t split_size_ = std::min(split_size, n - start); Tensor input_ = input.narrow(0, start, split_size_); Tensor output_ = output.narrow(0, start, split_size_); - func_32bit(output_, input_, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + func_32bit( + output_, + input_, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } return; } - // If control flow reaches here, this means even splitting N is not enough, then things starts to become complicated: - // For example, for conv2d, there following questions needs to be considered. + // If control flow reaches here, this means even splitting N is not enough, + // then things starts to become complicated: For example, for conv2d, there + // following questions needs to be considered. // - Is the memory layout NCHW or NHWC ? // - If the conv is NCHW -> NC'H'W', then should we // - split only NC? // - split only N'C'? // - split both? - // - If the conv is NHWC, then we need to split across H, we need to be very careful about the boundary condition + // - If the conv is NHWC, then we need to split across H, we need to be very + // careful about the boundary condition // to make sure that the boundary is handled correctly. - // - If we decide to make these splits, is the memory contiguous? Do we need to copy the memory? - // Considering the complexity of this issue, it is better not to use cuDNN for this case + // - If we decide to make these splits, is the memory contiguous? Do we need + // to copy the memory? 
Considering the complexity of this issue, it is better + // not to use cuDNN for this case TORCH_INTERNAL_ASSERT(false, "This case should not be dispatched to cuDNN."); } - -#define ASSERT_CORRECT_PRECISION(math_type) \ -if (args.params.dataType == CUDNN_DATA_FLOAT) { \ - TORCH_INTERNAL_ASSERT(args.params.allow_tf32 || math_type == CUDNN_FMA_MATH); \ -} - +#define ASSERT_CORRECT_PRECISION(math_type) \ + if (args.params.dataType == CUDNN_DATA_FLOAT) { \ + TORCH_INTERNAL_ASSERT( \ + args.params.allow_tf32 || math_type == CUDNN_FMA_MATH); \ + } // --------------------------------------------------------------------- // @@ -597,56 +695,112 @@ if (args.params.dataType == CUDNN_DATA_FLOAT) { // --------------------------------------------------------------------- void raw_cudnn_convolution_forward_out_32bit( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - + const Tensor& output, + const Tensor& input, + const Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { auto dataType = getCudnnDataType(input); - ConvolutionArgs args{ input, output, weight }; + ConvolutionArgs args{input, output, weight}; args.handle = getCudnnHandle(); - at::MemoryFormat memory_format = cudnn_conv_suggest_memory_format(input, weight); - setConvolutionParams(&args.params, input, weight, padding, stride, dilation, groups, deterministic, allow_tf32, memory_format); + at::MemoryFormat memory_format = + cudnn_conv_suggest_memory_format(input, weight); + setConvolutionParams( + &args.params, + input, + weight, + padding, + stride, + dilation, + groups, + deterministic, + allow_tf32, + memory_format); args.idesc.set(input, memory_format); args.wdesc.set(weight, memory_format, 0); args.odesc.set(output, memory_format); - args.cdesc.set(dataType, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, args.params.allow_tf32); + args.cdesc.set( + dataType, + input.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + args.params.allow_tf32); // TODO: when we do legacy group convolution support, we'll repeatedly // reinitialize the workspace for each convolution we do. This is // wasteful; we'd rather reuse the workspace. OTOH, legacy group // convolution support is already pretty slow, so this might not // matter. (This applies to raw_cudnn_convolution_backward_input as well.) 
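The split_batch_dim_to_32bit_out helper defined above keeps each cuDNN call under 32-bit indexing by chunking along the batch dimension only. A condensed, self-contained sketch of that heuristic (the exact split_size computation sits in unchanged context lines not shown in this hunk; split_over_batch and run_32bit are illustrative names, and max_worksize is the per-call element budget passed by the *_v7 wrappers, e.g. 1024 * 1024 * 256 for forward):

#include <ATen/ATen.h>
#include <algorithm>
template <typename Func>
void split_over_batch(const at::Tensor& output, const at::Tensor& input,
                      int64_t max_worksize, Func run_32bit) {
  int64_t n = output.size(0);
  int64_t max_inner_size = std::max(input.numel(), output.numel()) / n; // elements per sample
  int64_t split_size = std::max<int64_t>(max_worksize / max_inner_size, 1);
  for (int64_t start = 0; start < n; start += split_size) {
    int64_t len = std::min(split_size, n - start);
    // Each chunk is small enough for the 32-bit kernel path.
    run_32bit(output.narrow(0, start, len), input.narrow(0, start, len));
  }
}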
- AlgoIterator(args, benchmark).try_all( - [&](const cudnnConvolutionFwdAlgoPerf_t &fwdAlgPerf){ - Tensor workspace = allocate_workspace(fwdAlgPerf.memory, input); - - // update convDesc mathType since cudnn 7.4+ now requires both algo + mathType to figure out - // whether to use Tensor core kernels or not - // See Note [behavior of cudnnFind and cudnnGet] - ASSERT_CORRECT_PRECISION(fwdAlgPerf.mathType); - AT_CUDNN_CHECK_WITH_SHAPES(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), fwdAlgPerf.mathType), args); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - AT_CUDNN_CHECK_WITH_SHAPES(cudnnConvolutionForward( - args.handle, - &one, args.idesc.desc(), input.data_ptr(), - args.wdesc.desc(), weight.data_ptr(), - args.cdesc.desc(), fwdAlgPerf.algo, workspace.data_ptr(), fwdAlgPerf.memory, - &zero, args.odesc.desc(), output.data_ptr()), - args, "Forward algorithm: ", static_cast(fwdAlgPerf.algo), "\n"); - } - ); -} + AlgoIterator(args, benchmark) + .try_all([&](const cudnnConvolutionFwdAlgoPerf_t& fwdAlgPerf) { + Tensor workspace = allocate_workspace(fwdAlgPerf.memory, input); + + // update convDesc mathType since cudnn 7.4+ now requires both algo + + // mathType to figure out whether to use Tensor core kernels or not See + // Note [behavior of cudnnFind and cudnnGet] + ASSERT_CORRECT_PRECISION(fwdAlgPerf.mathType); + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnSetConvolutionMathType( + args.cdesc.mut_desc(), fwdAlgPerf.mathType), + args); + Constant one(dataType, 1); + Constant zero(dataType, 0); + + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnConvolutionForward( + args.handle, + &one, + args.idesc.desc(), + input.const_data_ptr(), + args.wdesc.desc(), + weight.const_data_ptr(), + args.cdesc.desc(), + fwdAlgPerf.algo, + workspace.data_ptr(), + fwdAlgPerf.memory, + &zero, + args.odesc.desc(), + output.data_ptr()), + args, + "Forward algorithm: ", + static_cast(fwdAlgPerf.algo), + "\n"); + }); +} void raw_cudnn_convolution_forward_out_v7( - const Tensor& output, const Tensor& input, const Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - split_batch_dim_to_32bit_out(output, input, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32, 1024 * 1024 * 256, raw_cudnn_convolution_forward_out_32bit); + const Tensor& output, + const Tensor& input, + const Tensor& weight, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + split_batch_dim_to_32bit_out( + output, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32, + 1024 * 1024 * 256, + raw_cudnn_convolution_forward_out_32bit); } // --------------------------------------------------------------------- @@ -659,54 +813,112 @@ void raw_cudnn_convolution_backward_input_out_32bit( const at::Tensor& grad_input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { auto dataType = getCudnnDataType(grad_output); - ConvolutionArgs args{ grad_input, grad_output, weight }; + ConvolutionArgs args{grad_input, grad_output, weight}; args.handle = getCudnnHandle(); - at::MemoryFormat memory_format = 
cudnn_conv_suggest_memory_format(grad_input, weight); - setConvolutionParams(&args.params, grad_input, weight, padding, stride, dilation, groups, deterministic, allow_tf32, memory_format); + at::MemoryFormat memory_format = + cudnn_conv_suggest_memory_format(grad_input, weight); + setConvolutionParams( + &args.params, + grad_input, + weight, + padding, + stride, + dilation, + groups, + deterministic, + allow_tf32, + memory_format); args.idesc.set(grad_input, memory_format); args.wdesc.set(weight, memory_format, 0); args.odesc.set(grad_output, memory_format); - args.cdesc.set(dataType, grad_output.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, args.params.allow_tf32); - - AlgoIterator(args, benchmark).try_all( - [&](const cudnnConvolutionBwdDataAlgoPerf_t &bwdDataAlgPerf){ - Tensor workspace = allocate_workspace(bwdDataAlgPerf.memory, grad_output); - - // update convDesc mathType since cudnn 7.4+ now requires both algo + mathType to figure out - // whether to use Tensor core kernels or not - // See Note [behavior of cudnnFind and cudnnGet] - ASSERT_CORRECT_PRECISION(bwdDataAlgPerf.mathType); - AT_CUDNN_CHECK_WITH_SHAPES(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), bwdDataAlgPerf.mathType), args); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - AT_CUDNN_CHECK_WITH_SHAPES(cudnnConvolutionBackwardData( - args.handle, - &one, args.wdesc.desc(), weight.data_ptr(), - args.odesc.desc(), grad_output.data_ptr(), - args.cdesc.desc(), bwdDataAlgPerf.algo, workspace.data_ptr(), bwdDataAlgPerf.memory, - &zero, args.idesc.desc(), grad_input.mutable_data_ptr()), - args, - "Additional pointer addresses: \n", - " grad_output: ", grad_output.data_ptr(), "\n", - " grad_input: ", grad_input.mutable_data_ptr(), "\n", - "Backward data algorithm: ", static_cast(bwdDataAlgPerf.algo), "\n"); - } - ); + args.cdesc.set( + dataType, + grad_output.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + args.params.allow_tf32); + + AlgoIterator(args, benchmark) + .try_all([&](const cudnnConvolutionBwdDataAlgoPerf_t& bwdDataAlgPerf) { + Tensor workspace = + allocate_workspace(bwdDataAlgPerf.memory, grad_output); + + // update convDesc mathType since cudnn 7.4+ now requires both algo + + // mathType to figure out whether to use Tensor core kernels or not See + // Note [behavior of cudnnFind and cudnnGet] + ASSERT_CORRECT_PRECISION(bwdDataAlgPerf.mathType); + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnSetConvolutionMathType( + args.cdesc.mut_desc(), bwdDataAlgPerf.mathType), + args); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnConvolutionBackwardData( + args.handle, + &one, + args.wdesc.desc(), + weight.const_data_ptr(), + args.odesc.desc(), + grad_output.const_data_ptr(), + args.cdesc.desc(), + bwdDataAlgPerf.algo, + workspace.data_ptr(), + bwdDataAlgPerf.memory, + &zero, + args.idesc.desc(), + grad_input.mutable_data_ptr()), + args, + "Additional pointer addresses: \n", + " grad_output: ", + grad_output.const_data_ptr(), + "\n", + " grad_input: ", + grad_input.mutable_data_ptr(), + "\n", + "Backward data algorithm: ", + static_cast(bwdDataAlgPerf.algo), + "\n"); + }); } void raw_cudnn_convolution_backward_input_out_v7( const at::Tensor& grad_input, const at::Tensor& grad_output, const at::Tensor& weight, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - 
split_batch_dim_to_32bit_out(grad_input, grad_output, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32, 1024 * 1024 * 128, raw_cudnn_convolution_backward_input_out_32bit); + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { + split_batch_dim_to_32bit_out( + grad_input, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32, + 1024 * 1024 * 128, + raw_cudnn_convolution_backward_input_out_32bit); } // --------------------------------------------------------------------- @@ -716,98 +928,176 @@ void raw_cudnn_convolution_backward_input_out_v7( // --------------------------------------------------------------------- void raw_cudnn_convolution_backward_weight_out_32bit( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { - + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { auto dataType = getCudnnDataType(input); - ConvolutionArgs args{ input, grad_output, grad_weight }; + ConvolutionArgs args{input, grad_output, grad_weight}; args.handle = getCudnnHandle(); - at::MemoryFormat memory_format = cudnn_conv_suggest_memory_format(input, grad_weight); - setConvolutionParams(&args.params, input, grad_weight, padding, stride, dilation, groups, deterministic, allow_tf32, memory_format); + at::MemoryFormat memory_format = + cudnn_conv_suggest_memory_format(input, grad_weight); + setConvolutionParams( + &args.params, + input, + grad_weight, + padding, + stride, + dilation, + groups, + deterministic, + allow_tf32, + memory_format); args.idesc.set(input, memory_format); args.wdesc.set(grad_weight, memory_format, 0); args.odesc.set(grad_output, memory_format); - args.cdesc.set(dataType, input.dim() - 2, args.params.padding, args.params.stride, args.params.dilation, args.params.groups, args.params.allow_tf32); - - AlgoIterator(args, benchmark).try_all( - [&](const cudnnConvolutionBwdFilterAlgoPerf_t &bwdFilterAlgPerf){ - Tensor workspace = allocate_workspace(bwdFilterAlgPerf.memory, input); - - // update convDesc mathType since cudnn 7.4+ now requires both algo + mathType to figure out - // whether to use Tensor core kernels or not - // See Note [behavior of cudnnFind and cudnnGet] - ASSERT_CORRECT_PRECISION(bwdFilterAlgPerf.mathType); - AT_CUDNN_CHECK_WITH_SHAPES(cudnnSetConvolutionMathType(args.cdesc.mut_desc(), bwdFilterAlgPerf.mathType), args); - - Constant one(dataType, 1); - Constant zero(dataType, 0); - - AT_CUDNN_CHECK_WITH_SHAPES(cudnnConvolutionBackwardFilter( - args.handle, - &one, args.idesc.desc(), input.data_ptr(), - args.odesc.desc(), grad_output.data_ptr(), - args.cdesc.desc(), bwdFilterAlgPerf.algo, workspace.data_ptr(), bwdFilterAlgPerf.memory, - &zero, args.wdesc.desc(), grad_weight.data_ptr()), - args, - "Additional pointer addresses: \n", - " grad_output: ", grad_output.data_ptr(), "\n", - " grad_weight: ", grad_weight.data_ptr(), "\n", - "Backward filter algorithm: ", static_cast(bwdFilterAlgPerf.algo), "\n"); - } - ); + args.cdesc.set( + dataType, + input.dim() - 2, + args.params.padding, + args.params.stride, + args.params.dilation, + args.params.groups, + 
args.params.allow_tf32); + + AlgoIterator(args, benchmark) + .try_all( + [&](const cudnnConvolutionBwdFilterAlgoPerf_t& bwdFilterAlgPerf) { + Tensor workspace = + allocate_workspace(bwdFilterAlgPerf.memory, input); + + // update convDesc mathType since cudnn 7.4+ now requires both algo + // + mathType to figure out whether to use Tensor core kernels or + // not See Note [behavior of cudnnFind and cudnnGet] + ASSERT_CORRECT_PRECISION(bwdFilterAlgPerf.mathType); + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnSetConvolutionMathType( + args.cdesc.mut_desc(), bwdFilterAlgPerf.mathType), + args); + + Constant one(dataType, 1); + Constant zero(dataType, 0); + + AT_CUDNN_CHECK_WITH_SHAPES( + cudnnConvolutionBackwardFilter( + args.handle, + &one, + args.idesc.desc(), + input.const_data_ptr(), + args.odesc.desc(), + grad_output.const_data_ptr(), + args.cdesc.desc(), + bwdFilterAlgPerf.algo, + workspace.data_ptr(), + bwdFilterAlgPerf.memory, + &zero, + args.wdesc.desc(), + grad_weight.data_ptr()), + args, + "Additional pointer addresses: \n", + " grad_output: ", + grad_output.const_data_ptr(), + "\n", + " grad_weight: ", + grad_weight.data_ptr(), + "\n", + "Backward filter algorithm: ", + static_cast(bwdFilterAlgPerf.algo), + "\n"); + }); } void raw_cudnn_convolution_backward_weight_out_v7( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, int64_t groups, - bool benchmark, bool deterministic, bool allow_tf32) { + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool benchmark, + bool deterministic, + bool allow_tf32) { constexpr int64_t int_max = std::numeric_limits::max(); const int64_t ni = input.numel(); const int64_t no = grad_output.numel(); // Assume the shape of the tensor is (N, C, D1, D2, ...) // if N * C * D1 * D2 * ... <= int_max, then no need to split at all if (ni <= int_max && no <= int_max) { - raw_cudnn_convolution_backward_weight_out_32bit(grad_weight, grad_output, input, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + raw_cudnn_convolution_backward_weight_out_32bit( + grad_weight, + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); return; } - // else, if C * D1 * D2 * ... <= int_max, then we just need to split across the N dimension + // else, if C * D1 * D2 * ... <= int_max, then we just need to split across + // the N dimension // // Here we use a simple heuristics to determine the size of each split // We don't max out the 2^31 address space because this number is super // large and very likely to get an OOM. int64_t n = grad_output.size(0); int64_t max_inner_size = std::max(ni, no) / n; - int64_t split_size = std::max(1024 * 1024 * 512 / max_inner_size, 1L); + int64_t split_size = + std::max(1024 * 1024 * 512 / max_inner_size, 1L); int64_t num_splits = (n + split_size - 1) / split_size; if (split_size * max_inner_size < int_max) { - const auto kAccType = (grad_weight.scalar_type() == kHalf || grad_weight.scalar_type() == kBFloat16) - ? kFloat : grad_weight.scalar_type(); - Tensor grad_weight_accumulator = at::zeros(grad_weight.sizes(), grad_weight.options().dtype(kAccType)); + const auto kAccType = (grad_weight.scalar_type() == kHalf || + grad_weight.scalar_type() == kBFloat16) + ? 
kFloat + : grad_weight.scalar_type(); + Tensor grad_weight_accumulator = + at::zeros(grad_weight.sizes(), grad_weight.options().dtype(kAccType)); for (const auto i : c10::irange(num_splits)) { int64_t start = split_size * i; int64_t split_size_ = std::min(split_size, n - start); Tensor input_ = input.narrow(0, start, split_size_); Tensor grad_output_ = grad_output.narrow(0, start, split_size_); Tensor grad_weight_ = at::empty_like(grad_weight); - raw_cudnn_convolution_backward_weight_out_32bit(grad_weight_, grad_output_, input_, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); + raw_cudnn_convolution_backward_weight_out_32bit( + grad_weight_, + grad_output_, + input_, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); grad_weight_accumulator.add_(grad_weight_); } grad_weight.copy_(grad_weight_accumulator); return; } - // If control flow reaches here, this means even splitting N is not enough, then things starts to become complicated: - // For example, for conv2d, there following questions needs to be considered. + // If control flow reaches here, this means even splitting N is not enough, + // then things starts to become complicated: For example, for conv2d, there + // following questions needs to be considered. // - Is the memory layout NCHW or NHWC ? // - If the conv is NCHW -> NC'H'W', then should we // - split only NC? // - split only N'C'? // - split both? - // - If the conv is NHWC, then we need to split across H, we need to be very careful about the boundary condition + // - If the conv is NHWC, then we need to split across H, we need to be very + // careful about the boundary condition // to make sure that the boundary is handled correctly. - // - If we decide to make these splits, is the memory contiguous? Do we need to copy the memory? - // Considering the complexity of this issue, it is better not to use cuDNN for this case + // - If we decide to make these splits, is the memory contiguous? Do we need + // to copy the memory? 
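The precision trick above, accumulating per-chunk weight gradients in float whenever grad_weight is half or bfloat16 and down-casting once at the end, can be read in isolation as follows. A hedged sketch; accumulate_grad_weight_in_fp32 and compute_chunk_grad are illustrative names standing in for the surrounding loop and for raw_cudnn_convolution_backward_weight_out_32bit:

#include <ATen/ATen.h>
#include <algorithm>
template <typename Func>
void accumulate_grad_weight_in_fp32(at::Tensor& grad_weight, int64_t n,
                                    int64_t split_size, Func compute_chunk_grad) {
  auto acc_type = (grad_weight.scalar_type() == at::kHalf ||
                   grad_weight.scalar_type() == at::kBFloat16)
      ? at::kFloat
      : grad_weight.scalar_type();
  at::Tensor acc = at::zeros(grad_weight.sizes(), grad_weight.options().dtype(acc_type));
  for (int64_t start = 0; start < n; start += split_size) {
    int64_t len = std::min(split_size, n - start);
    at::Tensor chunk = at::empty_like(grad_weight);
    compute_chunk_grad(chunk, start, len); // fills chunk for samples [start, start + len)
    acc.add_(chunk);                       // accumulation happens in float
  }
  grad_weight.copy_(acc);                  // single down-cast at the end
}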
Considering the complexity of this issue, it is better + // not to use cuDNN for this case TORCH_INTERNAL_ASSERT(false, "This case should not be dispatched to cuDNN."); } @@ -828,7 +1118,8 @@ void raw_cudnn_convolution_add_relu_out_v7( auto dataType = getCudnnDataType(input); ConvolutionArgs args{input, output, weight}; args.handle = getCudnnHandle(); - at::MemoryFormat memory_format = cudnn_conv_suggest_memory_format(input, weight); + at::MemoryFormat memory_format = + cudnn_conv_suggest_memory_format(input, weight); setConvolutionParams( &args.params, input, @@ -882,24 +1173,26 @@ void raw_cudnn_convolution_add_relu_out_v7( args.handle, &one, args.idesc.desc(), - input.data_ptr(), + input.const_data_ptr(), args.wdesc.desc(), - weight.data_ptr(), + weight.const_data_ptr(), args.cdesc.desc(), fwdAlgPerf.algo, workspace.data_ptr(), fwdAlgPerf.memory, &alpha_, zdesc.desc(), - z.data_ptr(), + z.const_data_ptr(), bdesc.desc(), - bias.data_ptr(), + bias.const_data_ptr(), adesc.desc(), args.odesc.desc(), output.data_ptr()), args, - "zdesc: ", zdesc, - "bdesc: ", bdesc, + "zdesc: ", + zdesc, + "bdesc: ", + bdesc, "cudnnConvolutionBiasActivationForward: ", static_cast(fwdAlgPerf.algo), "\n"); @@ -920,17 +1213,29 @@ void raw_cudnn_convolution_add_relu_fallback_out( bool benchmark, bool deterministic, bool allow_tf32) { - // cuDNN Conv-Bias-Activation: // y = act ( alpha1 * conv(x) + alpha2 * z + bias ) - // In pytorch function `raw_cudnn_convolution_add_relu_out`: alpha1 is 1, alpha 2 is `float alpha` + // In pytorch function `raw_cudnn_convolution_add_relu_out`: alpha1 is 1, + // alpha 2 is `float alpha` - raw_cudnn_convolution_forward_out(output, input, weight, padding, stride, dilation, groups, benchmark, deterministic, allow_tf32); - at::Tensor alpha_mul_z_add_bias = at::native::reshape_bias(input.dim(), bias).add(z, alpha); + raw_cudnn_convolution_forward_out( + output, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); + at::Tensor alpha_mul_z_add_bias = + at::native::reshape_bias(input.dim(), bias).add(z, alpha); output.add_(alpha_mul_z_add_bias); output.relu_(); } -}} // namespace at::native +} // namespace native +} // namespace at #endif diff --git a/aten/src/ATen/native/cudnn/Conv_v8.cpp b/aten/src/ATen/native/cudnn/Conv_v8.cpp index aa582fc19e14c..750cbcca6b6d0 100644 --- a/aten/src/ATen/native/cudnn/Conv_v8.cpp +++ b/aten/src/ATen/native/cudnn/Conv_v8.cpp @@ -1,6 +1,6 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include // for the definition of AT_CUDNN_ENABLED +#include // for the definition of AT_CUDNN_ENABLED #if AT_CUDNN_ENABLED() @@ -12,23 +12,22 @@ C10_DIAGNOSTIC_PUSH_AND_IGNORED_IF_DEFINED("-Wsuggest-override") #include C10_DIAGNOSTIC_POP() -#include -#include -#include #include +#include #include +#include #include #include #include -#include -#include +#include +#include -#include -#include #include +#include +#include -#include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -40,19 +39,20 @@ C10_DIAGNOSTIC_POP() #include #endif -namespace at { namespace native { +namespace at { +namespace native { namespace { // TODO: remove duplicate code in Conv_v7.cpp -constexpr int64_t operator "" _TiB(unsigned long long n) { +constexpr int64_t operator"" _TiB(unsigned long long n) { return size_t(n) << 40; } -uint8_t getAlignment(const Tensor &t) { +uint8_t getAlignment(const Tensor& t) { // alignment are in bytes uint8_t alignment = 1; - uintptr_t address = reinterpret_cast(t.data_ptr()); + uintptr_t address = 
reinterpret_cast(t.const_data_ptr()); for (; alignment < 32; alignment *= 2) { if (address % (alignment * 2)) { return alignment; @@ -61,17 +61,25 @@ uint8_t getAlignment(const Tensor &t) { return alignment; } -cudnn_frontend::Tensor getTensorDescriptorWithTypeVirtual(const Tensor &t, const int64_t id, const uint8_t alignment, const cudnnDataType_t dataType, const at::MemoryFormat memory_format, const bool _virtual) { -#if defined(__linux__) && !defined(FBCODE_CAFFE2) && CUDNN_MAJOR == 8 && CUDNN_MINOR > 5 - // Workaround for cudnn error handling deficiency, that results in a crash on Ubuntu-22+ - // if `libnvrtc.so` is not found on the system, which strictly speaking is not necessary - // for usecases below - // See https://github.com/pytorch/pytorch/issues/97041 +cudnn_frontend::Tensor getTensorDescriptorWithTypeVirtual( + const Tensor& t, + const int64_t id, + const uint8_t alignment, + const cudnnDataType_t dataType, + const at::MemoryFormat memory_format, + const bool _virtual) { +#if defined(__linux__) && !defined(FBCODE_CAFFE2) && CUDNN_MAJOR == 8 && \ + CUDNN_MINOR > 5 + // Workaround for cudnn error handling deficiency, that results in a crash on + // Ubuntu-22+ if `libnvrtc.so` is not found on the system, which strictly + // speaking is not necessary for usecases below See + // https://github.com/pytorch/pytorch/issues/97041 static C10_UNUSED auto cudnn_cnn_infer_handler = [] { - void *handle = dlopen("libcudnn_cnn_infer.so.8", RTLD_LAZY); - char *err = dlerror(); + void* handle = dlopen("libcudnn_cnn_infer.so.8", RTLD_LAZY); + char* err = dlerror(); if (!handle) { - TORCH_WARN("Attempt to open cnn_infer failed: handle=", handle, " error: ", err); + TORCH_WARN( + "Attempt to open cnn_infer failed: handle=", handle, " error: ", err); } else if (err) { TORCH_WARN("Applied workaround for CuDNN issue, install nvrtc.so"); } @@ -81,52 +89,74 @@ cudnn_frontend::Tensor getTensorDescriptorWithTypeVirtual(const Tensor &t, const auto sizes = t.sizes(); auto strides = t.strides(); bool channels_last = memory_format == at::MemoryFormat::ChannelsLast || - memory_format == at::MemoryFormat::ChannelsLast3d; - fixSizeOneDimStride(sizes.size(), &sizes[0], (int64_t *) &strides[0], channels_last); + memory_format == at::MemoryFormat::ChannelsLast3d; + fixSizeOneDimStride( + sizes.size(), &sizes[0], (int64_t*)&strides[0], channels_last); auto r = cudnn_frontend::TensorBuilder() - .setDim(sizes.size(), sizes.data()) - .setStrides(strides.size(), strides.data()) - .setId(id) - .setAlignment(alignment) - .setDataType(dataType) - .setVirtual(_virtual) - .build(); + .setDim(sizes.size(), sizes.data()) + .setStrides(strides.size(), strides.data()) + .setId(id) + .setAlignment(alignment) + .setDataType(dataType) + .setVirtual(_virtual) + .build(); return r; } -cudnn_frontend::Tensor getTensorDescriptor(const Tensor &t, const int64_t id, const uint8_t alignment, const at::MemoryFormat memory_format) { - return getTensorDescriptorWithTypeVirtual(t, id, alignment, getCudnnDataType(t), memory_format, false); +cudnn_frontend::Tensor getTensorDescriptor( + const Tensor& t, + const int64_t id, + const uint8_t alignment, + const at::MemoryFormat memory_format) { + return getTensorDescriptorWithTypeVirtual( + t, id, alignment, getCudnnDataType(t), memory_format, false); } -cudnn_frontend::ConvDesc_v8 getConvDescriptor(cudnnDataType_t dataType, IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, const at::ScalarType scalar_type) { +cudnn_frontend::ConvDesc_v8 getConvDescriptor( + cudnnDataType_t dataType, 
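The getAlignment helper above probes how strongly a tensor's data pointer is aligned, which feeds the cudnn_frontend tensor descriptors. A functionally equivalent standalone version (the name pointer_alignment is illustrative):

#include <cstdint>
// Largest power-of-two alignment in bytes, capped at 32, that divides the address.
inline uint8_t pointer_alignment(const void* p) {
  auto address = reinterpret_cast<uintptr_t>(p);
  uint8_t alignment = 1;
  while (alignment < 32 && address % (alignment * 2) == 0) {
    alignment *= 2;
  }
  return alignment;
}
// e.g. pointer_alignment(reinterpret_cast<const void*>(0x1000)) == 32,
//      pointer_alignment(reinterpret_cast<const void*>(0x1004)) == 4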
+ IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + const at::ScalarType scalar_type) { uint64_t convDim = stride.size(); if (scalar_type == kBFloat16 || scalar_type == kHalf) { dataType = CUDNN_DATA_FLOAT; } return cudnn_frontend::ConvDescBuilder() - .setDataType(dataType) - .setMathMode(CUDNN_CROSS_CORRELATION) - .setNDims(convDim) - .setStrides(convDim, stride.data()) - .setPrePadding(convDim, padding.data()) - .setPostPadding(convDim, padding.data()) - .setDilation(convDim, dilation.data()) - .build(); + .setDataType(dataType) + .setMathMode(CUDNN_CROSS_CORRELATION) + .setNDims(convDim) + .setStrides(convDim, stride.data()) + .setPrePadding(convDim, padding.data()) + .setPostPadding(convDim, padding.data()) + .setDilation(convDim, dilation.data()) + .build(); } void filterEngineConfigs( - cudnn_frontend::EngineConfigList &from, - cudnn_frontend::EngineConfigList &to, - bool deterministic, bool allow_tf32, c10::ScalarType scalar_type) -{ + cudnn_frontend::EngineConfigList& from, + cudnn_frontend::EngineConfigList& to, + bool deterministic, + bool allow_tf32, + c10::ScalarType scalar_type) { auto filter = [=](cudnnBackendDescriptor_t c) { if (deterministic) { - if (cudnn_frontend::hasNumericalNote(c)) {return true;} + if (cudnn_frontend::hasNumericalNote< + CUDNN_NUMERICAL_NOTE_NONDETERMINISTIC>(c)) { + return true; + } + } + if (cudnn_frontend::hasNumericalNote< + CUDNN_NUMERICAL_NOTE_DOWN_CONVERT_INPUTS>(c)) { + return true; } - if (cudnn_frontend::hasNumericalNote(c)) {return true;} if (scalar_type == kFloat) { // TODO: check under which conditions this is OK - if (!allow_tf32 && cudnn_frontend::hasNumericalNote(c)) {return true;} + if (!allow_tf32 && + cudnn_frontend::hasNumericalNote( + c)) { + return true; + } } return false; }; @@ -149,14 +179,35 @@ struct CacheKeyFused { uint8_t y_alignment; uint8_t z_alignment; uint8_t b_alignment; - // TODO: does it make sense to have this in the key? but alpha is a graph-level param... + // TODO: does it make sense to have this in the key? but alpha is a + // graph-level param... 
float alpha; }; struct CacheKeyWrapper : ParamsWrapper { - CacheKeyWrapper(const cudnnBackendDescriptorType_t operation, const Tensor& y, const Tensor& x, const Tensor& w, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, int64_t groups, bool deterministic, bool allow_tf32) { + CacheKeyWrapper( + const cudnnBackendDescriptorType_t operation, + const Tensor& y, + const Tensor& x, + const Tensor& w, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + int64_t groups, + bool deterministic, + bool allow_tf32) { at::MemoryFormat memory_format = cudnn_conv_suggest_memory_format(x, w); - setConvolutionParams(&(this->pod.params), x, w, padding, stride, dilation, groups, deterministic, allow_tf32, memory_format); + setConvolutionParams( + &(this->pod.params), + x, + w, + padding, + stride, + dilation, + groups, + deterministic, + allow_tf32, + memory_format); this->pod.operation = operation; this->pod.x_alignment = getAlignment(x); this->pod.y_alignment = getAlignment(y); @@ -165,9 +216,31 @@ struct CacheKeyWrapper : ParamsWrapper { }; struct CacheKeyFusedWrapper : ParamsWrapper { - CacheKeyFusedWrapper(const Tensor& y, const Tensor& x, const Tensor& w, const Tensor& z, const Tensor& b, const float alpha, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, int64_t groups, bool deterministic, bool allow_tf32) { + CacheKeyFusedWrapper( + const Tensor& y, + const Tensor& x, + const Tensor& w, + const Tensor& z, + const Tensor& b, + const float alpha, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + int64_t groups, + bool deterministic, + bool allow_tf32) { at::MemoryFormat memory_format = cudnn_conv_suggest_memory_format(x, w); - setConvolutionParams(&(this->pod).params, x, w, padding, stride, dilation, groups, deterministic, allow_tf32, memory_format); + setConvolutionParams( + &(this->pod).params, + x, + w, + padding, + stride, + dilation, + groups, + deterministic, + allow_tf32, + memory_format); this->pod.x_alignment = getAlignment(x); this->pod.y_alignment = getAlignment(y); this->pod.w_alignment = getAlignment(w); @@ -178,177 +251,305 @@ struct CacheKeyFusedWrapper : ParamsWrapper { }; static int getLRUCacheLimit() { - constexpr int DEFAULT_LIMIT = 10000; // roughly corresponds to 2GiB assuming 200KiB per ExecutionPlan + constexpr int DEFAULT_LIMIT = + 10000; // roughly corresponds to 2GiB assuming 200KiB per ExecutionPlan // 0 is used to indicate no limit // negative values are used to indicate no caching static int limit = [&] { - const char * val = getenv("TORCH_CUDNN_V8_API_LRU_CACHE_LIMIT"); + const char* val = getenv("TORCH_CUDNN_V8_API_LRU_CACHE_LIMIT"); if (!val) { - return DEFAULT_LIMIT; + return DEFAULT_LIMIT; } try { return std::stoi(val); - } catch(std::invalid_argument const& e) { - TORCH_WARN("invalid TORCH_CUDNN_V8_API_LRU_CACHE_LIMIT,", - " using default LRU cache limit of ", DEFAULT_LIMIT, " entries."); - } catch(std::out_of_range const& e) { - TORCH_WARN("invalid TORCH_CUDNN_V8_API_LRU_CACHE_LIMIT,", - " using default LRU cache limit of ", DEFAULT_LIMIT, " entries."); + } catch (std::invalid_argument const& e) { + TORCH_WARN( + "invalid TORCH_CUDNN_V8_API_LRU_CACHE_LIMIT,", + " using default LRU cache limit of ", + DEFAULT_LIMIT, + " entries."); + } catch (std::out_of_range const& e) { + TORCH_WARN( + "invalid TORCH_CUDNN_V8_API_LRU_CACHE_LIMIT,", + " using default LRU cache limit of ", + DEFAULT_LIMIT, + " entries."); } return DEFAULT_LIMIT; - 
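The limit parsed above from TORCH_CUDNN_V8_API_LRU_CACHE_LIMIT (0 meaning unbounded, negative disabling caching) feeds a textbook list-plus-map LRU in the BenchmarkCache that follows, kept thread_local because cuDNN ExecutionPlans are not guaranteed to be thread safe across all engines. The core eviction mechanism, reduced to a generic sketch with illustrative names (assumes std::hash<K> and a positive limit; the real cache supplies ParamsWrapperHash and handles the special limit values):

#include <list>
#include <unordered_map>
#include <utility>
template <typename K, typename V>
struct TinyLru {
  size_t limit = 10000;               // mirrors DEFAULT_LIMIT above
  std::list<K> order;                 // front = most recently used
  std::unordered_map<K, std::pair<V, typename std::list<K>::iterator>> map;
  V* find(const K& key) {
    auto it = map.find(key);
    if (it == map.end()) return nullptr;
    order.splice(order.begin(), order, it->second.second); // bump to front
    return &it->second.first;
  }
  void update(const K& key, V value) {
    auto it = map.find(key);
    if (it != map.end()) {
      it->second.first = std::move(value);
      order.splice(order.begin(), order, it->second.second);
      return;
    }
    if (map.size() >= limit) {        // evict least recently used
      map.erase(order.back());
      order.pop_back();
    }
    order.push_front(key);
    map.emplace(key, std::make_pair(std::move(value), order.begin()));
  }
};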
} (); + }(); return limit; } template struct BenchmarkCache { -std::list engine_cache_order; -std::unordered_map::iterator>, ParamsWrapperHash> engine_cache; - -// no mutexes here as caches are now thread local for v8, can also return a pointer -// to the Execution Plan if we know it will not be invalidated by another thread -cudnn_frontend::ExecutionPlan* find(const KeyType& key) { - const int lru_cache_limit = getLRUCacheLimit(); - if (lru_cache_limit < 0) { - return nullptr; - } - auto it = engine_cache.find(key); - if (it == engine_cache.end()) { - return nullptr; - } - if (lru_cache_limit) { - // update most recently accessed - engine_cache_order.splice(engine_cache_order.begin(), engine_cache_order, it->second.second); - } - return &(it->second.first); -} + std::list engine_cache_order; + std::unordered_map< + KeyType, + std::pair< + cudnn_frontend::ExecutionPlan, + typename std::list::iterator>, + ParamsWrapperHash> + engine_cache; -void update(const KeyType& key, T& results) { - int lru_cache_limit = getLRUCacheLimit(); - if (lru_cache_limit < 0) { - return; - } else if (lru_cache_limit) { + // no mutexes here as caches are now thread local for v8, can also return a + // pointer to the Execution Plan if we know it will not be invalidated by + // another thread + cudnn_frontend::ExecutionPlan* find(const KeyType& key) { + const int lru_cache_limit = getLRUCacheLimit(); + if (lru_cache_limit < 0) { + return nullptr; + } auto it = engine_cache.find(key); if (it == engine_cache.end()) { - if ((long) engine_cache.size() >= lru_cache_limit) { - auto erase_count = engine_cache.erase(engine_cache_order.back()); - TORCH_INTERNAL_ASSERT(erase_count == 1, "CUDNN V8 LRU Cache Corrupted (eviction key not in map). Please report a bug to PyTorch."); - engine_cache_order.pop_back(); - } - engine_cache_order.emplace_front(key); - engine_cache.emplace(key, std::make_pair(results, engine_cache_order.begin())); - } else { - it->second.first = results; + return nullptr; + } + if (lru_cache_limit) { // update most recently accessed - engine_cache_order.splice(engine_cache_order.begin(), engine_cache_order, it->second.second); + engine_cache_order.splice( + engine_cache_order.begin(), engine_cache_order, it->second.second); } - } else { - engine_cache.erase(key); - engine_cache.emplace(key, std::make_pair(results, engine_cache_order.end())); // dummy iterator + return &(it->second.first); } -} + void update(const KeyType& key, T& results) { + int lru_cache_limit = getLRUCacheLimit(); + if (lru_cache_limit < 0) { + return; + } else if (lru_cache_limit) { + auto it = engine_cache.find(key); + if (it == engine_cache.end()) { + if ((long)engine_cache.size() >= lru_cache_limit) { + auto erase_count = engine_cache.erase(engine_cache_order.back()); + TORCH_INTERNAL_ASSERT( + erase_count == 1, + "CUDNN V8 LRU Cache Corrupted (eviction key not in map). 
Please report a bug to PyTorch."); + engine_cache_order.pop_back(); + } + engine_cache_order.emplace_front(key); + engine_cache.emplace( + key, std::make_pair(results, engine_cache_order.begin())); + } else { + it->second.first = results; + // update most recently accessed + engine_cache_order.splice( + engine_cache_order.begin(), engine_cache_order, it->second.second); + } + } else { + engine_cache.erase(key); + engine_cache.emplace( + key, + std::make_pair(results, engine_cache_order.end())); // dummy iterator + } + } }; -// @eqy: use thread local caches as cuDNN Execution Plans are not guaranteed to be thread safe across all engines -// see Limitations in https://docs.nvidia.com/deeplearning/cudnn/release-notes/index.html -thread_local BenchmarkCache benchmark_cache; -thread_local BenchmarkCache benchmark_cache_fused; +// @eqy: use thread local caches as cuDNN Execution Plans are not guaranteed to +// be thread safe across all engines see Limitations in +// https://docs.nvidia.com/deeplearning/cudnn/release-notes/index.html +thread_local BenchmarkCache + benchmark_cache; +thread_local BenchmarkCache + benchmark_cache_fused; } // namespace -void run_conv_plan(cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w, const cudnn_frontend::ExecutionPlan& plan) { +void run_conv_plan( + cudnnHandle_t handle, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const cudnn_frontend::ExecutionPlan& plan, + const cudnnBackendDescriptorType_t operation) { c10::DeviceGuard g(x.options().device()); auto workspace_size = plan.getWorkspaceSize(); - auto workspace_ptr = c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); - void *data_ptrs[] = {x.data_ptr(), y.data_ptr(), w.data_ptr()}; + auto workspace_ptr = + c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); + void* data_ptrs[3]; + + if (operation == CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR) { + data_ptrs[0] = const_cast(x.const_data_ptr()); + data_ptrs[1] = y.data_ptr(); + data_ptrs[2] = const_cast(w.const_data_ptr()); + } else if ( + operation == + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR) { + data_ptrs[0] = x.data_ptr(); + data_ptrs[1] = const_cast(y.const_data_ptr()); + data_ptrs[2] = const_cast(w.const_data_ptr()); + } else if ( + operation == + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR) { + data_ptrs[0] = const_cast(x.const_data_ptr()); + data_ptrs[1] = const_cast(y.const_data_ptr()); + data_ptrs[2] = w.data_ptr(); + } else { + data_ptrs[0] = x.data_ptr(); + data_ptrs[1] = y.data_ptr(); + data_ptrs[2] = w.data_ptr(); + } + int64_t uids[] = {'x', 'y', 'w'}; - auto variantPack = cudnn_frontend::VariantPackBuilder() - .setWorkspacePointer(workspace_size ? workspace_ptr.get() : nullptr) - .setDataPointers(3, data_ptrs) - .setUids(3, uids) - .build(); - AT_CUDNN_CHECK(cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc())); + auto variantPack = + cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace_size ? 
workspace_ptr.get() : nullptr) + .setDataPointers(3, data_ptrs) + .setUids(3, uids) + .build(); + AT_CUDNN_CHECK(cudnnBackendExecute( + handle, plan.get_raw_desc(), variantPack.get_raw_desc())); } -void run_conv_plan_fused(cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, const cudnn_frontend::ExecutionPlan& plan) { +void run_conv_plan_fused( + cudnnHandle_t handle, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const Tensor& z, + const Tensor& b, + const cudnn_frontend::ExecutionPlan& plan) { c10::DeviceGuard g(x.options().device()); auto workspace_size = plan.getWorkspaceSize(); - auto workspace_ptr = c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); - void *data_ptrs[] = {x.data_ptr(), y.data_ptr(), w.data_ptr(), z.data_ptr(), b.data_ptr()}; + auto workspace_ptr = + c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); + void* data_ptrs[] = { + x.data_ptr(), y.data_ptr(), w.data_ptr(), z.data_ptr(), b.data_ptr()}; int64_t uids[] = {'x', 'y', 'w', 'z', 'b'}; - auto variantPack = cudnn_frontend::VariantPackBuilder() - .setWorkspacePointer(workspace_size ? workspace_ptr.get() : nullptr) - .setDataPointers(5, data_ptrs) - .setUids(5, uids) - .build(); - AT_CUDNN_CHECK(cudnnBackendExecute(handle, plan.get_raw_desc(), variantPack.get_raw_desc())); + auto variantPack = + cudnn_frontend::VariantPackBuilder() + .setWorkspacePointer(workspace_size ? workspace_ptr.get() : nullptr) + .setDataPointers(5, data_ptrs) + .setUids(5, uids) + .build(); + AT_CUDNN_CHECK(cudnnBackendExecute( + handle, plan.get_raw_desc(), variantPack.get_raw_desc())); } -auto build_opgraph(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const Tensor& x, const Tensor& y, const Tensor& w, const CacheKeyWrapper& key, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation) { +auto build_opgraph( + const cudnnHandle_t handle, + const cudnnBackendDescriptorType_t desc, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const CacheKeyWrapper& key, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation) { auto op = cudnn_frontend::OperationBuilder(desc) - .setxDesc(getTensorDescriptor(x, 'x', key.pod.x_alignment, key.pod.params.memory_format)) - .setyDesc(getTensorDescriptor(y, 'y', key.pod.y_alignment, key.pod.params.memory_format)) - .setwDesc(getTensorDescriptor(w, 'w', key.pod.w_alignment, key.pod.params.memory_format)) - .setcDesc(getConvDescriptor(key.pod.params.dataType, padding, stride, dilation, x.scalar_type())) - .build(); - std::array ops = {&op}; + .setxDesc(getTensorDescriptor( + x, 'x', key.pod.x_alignment, key.pod.params.memory_format)) + .setyDesc(getTensorDescriptor( + y, 'y', key.pod.y_alignment, key.pod.params.memory_format)) + .setwDesc(getTensorDescriptor( + w, 'w', key.pod.w_alignment, key.pod.params.memory_format)) + .setcDesc(getConvDescriptor( + key.pod.params.dataType, + padding, + stride, + dilation, + x.scalar_type())) + .build(); + std::array ops = {&op}; auto opGraph = cudnn_frontend::OperationGraphBuilder() - .setHandle(handle) - .setOperationGraph(ops.size(), ops.data()) - .build(); + .setHandle(handle) + .setOperationGraph(ops.size(), ops.data()) + .build(); return opGraph; } -auto build_opgraph_fused(const cudnnHandle_t handle, const Tensor & x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, const float alpha, const CacheKeyFusedWrapper& key, const IntArrayRef padding, const IntArrayRef stride, 
const IntArrayRef dilation) { - // need computation to be done in FLOAT type regardless of reduced precision input +auto build_opgraph_fused( + const cudnnHandle_t handle, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const Tensor& z, + const Tensor& b, + const float alpha, + const CacheKeyFusedWrapper& key, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation) { + // need computation to be done in FLOAT type regardless of reduced precision + // input const auto precision = CUDNN_DATA_FLOAT; auto addDesc = cudnn_frontend::PointWiseDescBuilder() - .setMode(CUDNN_POINTWISE_ADD) - .setMathPrecision(precision) - .build(); + .setMode(CUDNN_POINTWISE_ADD) + .setMathPrecision(precision) + .build(); auto addBiasDesc = cudnn_frontend::PointWiseDescBuilder() - .setMode(CUDNN_POINTWISE_ADD) - .setMathPrecision(precision) - .build(); + .setMode(CUDNN_POINTWISE_ADD) + .setMathPrecision(precision) + .build(); auto actDesc = cudnn_frontend::PointWiseDescBuilder() - .setMode(CUDNN_POINTWISE_RELU_FWD) - .setMathPrecision(precision) - .build(); - auto convDesc = getConvDescriptor(key.pod.params.dataType, padding, stride, dilation, x.scalar_type()); + .setMode(CUDNN_POINTWISE_RELU_FWD) + .setMathPrecision(precision) + .build(); + auto convDesc = getConvDescriptor( + key.pod.params.dataType, padding, stride, dilation, x.scalar_type()); const float alpha1 = 1.0; const float alpha2 = alpha; - auto conv_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR) - .setxDesc(getTensorDescriptor(x, 'x', key.pod.x_alignment, key.pod.params.memory_format)) - // virtual output of conv - .setyDesc(getTensorDescriptorWithTypeVirtual(y, 'C', key.pod.y_alignment, precision, key.pod.params.memory_format, true)) - .setwDesc(getTensorDescriptor(w, 'w', key.pod.w_alignment, key.pod.params.memory_format)) - .setAlpha(alpha1) - .setcDesc(convDesc) - .build(); - auto add_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) - .setxDesc(conv_op.getOutputTensor()) - .setbDesc(getTensorDescriptor(z, 'z', key.pod.z_alignment, key.pod.params.memory_format)) - // another virtual output (of add) - .setyDesc(getTensorDescriptorWithTypeVirtual(y, 'A', key.pod.y_alignment, precision, key.pod.params.memory_format, true)) - .setpwDesc(addDesc) - .setAlpha(alpha1) - .setAlpha2(alpha2) - .build(); - auto add_bias_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) - .setxDesc(add_op.getOutputTensor()) - .setbDesc(getTensorDescriptor(b, 'b', key.pod.b_alignment, key.pod.params.memory_format)) - // another virtual output (of add bias) - .setyDesc(getTensorDescriptorWithTypeVirtual(y, 'B', key.pod.y_alignment, precision, key.pod.params.memory_format, true)) - .setpwDesc(addBiasDesc) - .build(); - auto act_op = cudnn_frontend::OperationBuilder(CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) - .setxDesc(add_bias_op.getOutputTensor()) - // final output is in original datatype - .setyDesc(getTensorDescriptor(y, 'y', key.pod.y_alignment, key.pod.params.memory_format)) - .setpwDesc(actDesc) - .build(); - std::array ops = {&conv_op, &add_op, &add_bias_op, &act_op}; + auto conv_op = + cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR) + .setxDesc(getTensorDescriptor( + x, 'x', key.pod.x_alignment, key.pod.params.memory_format)) + // virtual output of conv + .setyDesc(getTensorDescriptorWithTypeVirtual( + y, + 'C', + key.pod.y_alignment, + precision, + 
key.pod.params.memory_format, + true)) + .setwDesc(getTensorDescriptor( + w, 'w', key.pod.w_alignment, key.pod.params.memory_format)) + .setAlpha(alpha1) + .setcDesc(convDesc) + .build(); + auto add_op = + cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(conv_op.getOutputTensor()) + .setbDesc(getTensorDescriptor( + z, 'z', key.pod.z_alignment, key.pod.params.memory_format)) + // another virtual output (of add) + .setyDesc(getTensorDescriptorWithTypeVirtual( + y, + 'A', + key.pod.y_alignment, + precision, + key.pod.params.memory_format, + true)) + .setpwDesc(addDesc) + .setAlpha(alpha1) + .setAlpha2(alpha2) + .build(); + auto add_bias_op = + cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(add_op.getOutputTensor()) + .setbDesc(getTensorDescriptor( + b, 'b', key.pod.b_alignment, key.pod.params.memory_format)) + // another virtual output (of add bias) + .setyDesc(getTensorDescriptorWithTypeVirtual( + y, + 'B', + key.pod.y_alignment, + precision, + key.pod.params.memory_format, + true)) + .setpwDesc(addBiasDesc) + .build(); + auto act_op = + cudnn_frontend::OperationBuilder( + CUDNN_BACKEND_OPERATION_POINTWISE_DESCRIPTOR) + .setxDesc(add_bias_op.getOutputTensor()) + // final output is in original datatype + .setyDesc(getTensorDescriptor( + y, 'y', key.pod.y_alignment, key.pod.params.memory_format)) + .setpwDesc(actDesc) + .build(); + std::array ops = { + &conv_op, &add_op, &add_bias_op, &act_op}; auto opGraph = cudnn_frontend::OperationGraphBuilder() .setHandle(handle) .setOperationGraph(ops.size(), ops.data()) @@ -356,31 +557,55 @@ auto build_opgraph_fused(const cudnnHandle_t handle, const Tensor & x, const Ten return opGraph; } -auto get_generator_sources(const cudnnBackendDescriptorType_t& desc, const Tensor& x, const bool deterministic, const bool allow_tf32, const cudnnBackendHeurMode_t heur_mode, const bool heuristic, const bool fallback) { +auto get_generator_sources( + const cudnnBackendDescriptorType_t& desc, + const Tensor& x, + const bool deterministic, + const bool allow_tf32, + const cudnnBackendHeurMode_t heur_mode, + const bool heuristic, + const bool fallback) { // Method for engine config generator based on heuristics - const auto heurgen_method = [/*&desc,*/ &x, deterministic, allow_tf32, heur_mode](cudnn_frontend::OperationGraph &opGraph) -> cudnn_frontend::EngineConfigList { - auto heuristics = cudnn_frontend::EngineHeuristicsBuilder() - .setOperationGraph(opGraph) - .setHeurMode(heur_mode) - .build(); - auto &engine_configs = heuristics.getEngineConfig(heuristics.getEngineConfigCount()); - cudnn_frontend::EngineConfigList filtered_configs; - filterEngineConfigs(engine_configs, filtered_configs, deterministic, allow_tf32, x.scalar_type()); - return filtered_configs; + const auto heurgen_method = + [/*&desc,*/ &x, deterministic, allow_tf32, heur_mode]( + cudnn_frontend::OperationGraph& opGraph) + -> cudnn_frontend::EngineConfigList { + auto heuristics = cudnn_frontend::EngineHeuristicsBuilder() + .setOperationGraph(opGraph) + .setHeurMode(heur_mode) + .build(); + auto& engine_configs = + heuristics.getEngineConfig(heuristics.getEngineConfigCount()); + cudnn_frontend::EngineConfigList filtered_configs; + filterEngineConfigs( + engine_configs, + filtered_configs, + deterministic, + allow_tf32, + x.scalar_type()); + return filtered_configs; }; // Method for engine config generator based on fallback list - const auto fallback_method = [&desc, &x, deterministic, 
allow_tf32](cudnn_frontend::OperationGraph &opGraph) -> cudnn_frontend::EngineConfigList { + const auto fallback_method = [&desc, &x, deterministic, allow_tf32]( + cudnn_frontend::OperationGraph& opGraph) + -> cudnn_frontend::EngineConfigList { auto fallback = cudnn_frontend::EngineFallbackListBuilder() .setOperationGraph(opGraph) .setOperation(desc) .build(); - auto &fallback_list = fallback.getFallbackList(); + auto& fallback_list = fallback.getFallbackList(); cudnn_frontend::EngineConfigList filtered_configs; - filterEngineConfigs(fallback_list, filtered_configs, deterministic, allow_tf32, x.scalar_type()); + filterEngineConfigs( + fallback_list, + filtered_configs, + deterministic, + allow_tf32, + x.scalar_type()); return filtered_configs; }; if (heuristic && fallback) { - std::vector sources = {heurgen_method, fallback_method}; + std::vector sources = { + heurgen_method, fallback_method}; return sources; } else if (heuristic) { std::vector sources = {heurgen_method}; @@ -392,7 +617,7 @@ auto get_generator_sources(const cudnnBackendDescriptorType_t& desc, const Tenso } int64_t get_available_workspace() { - int device; + c10::DeviceIndex device = 0; C10_CUDA_CHECK(c10::cuda::GetDevice(&device)); size_t max_block_size = 0; c10::cuda::CUDACachingAllocator::cacheInfo(device, &max_block_size); @@ -401,38 +626,55 @@ int64_t get_available_workspace() { static nlohmann::json errata_json_handle; -bool plan_errata_exception(const cudnnHandle_t handle, const std::string & executionPlanTag) { - static bool has_json = cudnn_frontend::load_from_config(errata_json_handle, ""); +bool plan_errata_exception( + const cudnnHandle_t handle, + const std::string& executionPlanTag) { + static bool has_json = + cudnn_frontend::load_from_config(errata_json_handle, ""); if (!has_json) { return false; } else { - return cudnn_frontend::check_errata(errata_json_handle, executionPlanTag, handle, [](){return true;}); + return cudnn_frontend::check_errata( + errata_json_handle, executionPlanTag, handle, []() { return true; }); } } -void generate_and_filter_plans(const cudnnHandle_t handle, cudnn_frontend::OperationGraph& opGraph, cudnn_frontend::EngineConfigGenerator& generator, const Tensor& x, cudnn_frontend::executionPlans_t& valid_plans, at::DataPtr& workspace_ptr) { - auto initial_predicate_function = [&](cudnn_frontend::ExecutionPlan const& plan) -> bool { +void generate_and_filter_plans( + const cudnnHandle_t handle, + cudnn_frontend::OperationGraph& opGraph, + cudnn_frontend::EngineConfigGenerator& generator, + const Tensor& x, + cudnn_frontend::executionPlans_t& valid_plans, + at::DataPtr& workspace_ptr) { + auto initial_predicate_function = + [&](cudnn_frontend::ExecutionPlan const& plan) -> bool { return plan_errata_exception(handle, plan.getTag()); }; - auto plans = generator.cudnnGetPlan(handle, opGraph, initial_predicate_function); + auto plans = + generator.cudnnGetPlan(handle, opGraph, initial_predicate_function); int64_t max_block_size = get_available_workspace(); int64_t max_workspace_size = 0; - std::for_each(plans.begin(), plans.end(), [&] (cudnn_frontend::ExecutionPlan& plan) { - int64_t curr_workspace_size = plan.getWorkspaceSize(); - if (curr_workspace_size <= max_block_size) { - if (curr_workspace_size > max_workspace_size) { - max_workspace_size = plan.getWorkspaceSize(); - } - valid_plans.emplace_back(std::move(plan)); - } - }); - TORCH_CHECK_WITH(OutOfMemoryError, max_workspace_size < 1_TiB, "Not enough memory for workspace!"); + std::for_each( + plans.begin(), plans.end(), 
[&](cudnn_frontend::ExecutionPlan& plan) { + int64_t curr_workspace_size = plan.getWorkspaceSize(); + if (curr_workspace_size <= max_block_size) { + if (curr_workspace_size > max_workspace_size) { + max_workspace_size = plan.getWorkspaceSize(); + } + valid_plans.emplace_back(std::move(plan)); + } + }); + TORCH_CHECK_WITH( + OutOfMemoryError, + max_workspace_size < 1_TiB, + "Not enough memory for workspace!"); bool remove_invalid = false; while (max_workspace_size) { try { - workspace_ptr = c10::cuda::CUDACachingAllocator::get()->allocate(max_workspace_size); + workspace_ptr = + c10::cuda::CUDACachingAllocator::get()->allocate(max_workspace_size); break; - } catch (c10::OutOfMemoryError &e) { + } catch (c10::OutOfMemoryError& e) { max_workspace_size /= 2; (void)cudaGetLastError(); // clear CUDA error remove_invalid = true; @@ -440,7 +682,7 @@ void generate_and_filter_plans(const cudnnHandle_t handle, cudnn_frontend::Opera } if (remove_invalid) { cudnn_frontend::executionPlans_t new_valid_plans; - for (auto &plan : valid_plans) { + for (auto& plan : valid_plans) { if (plan.getWorkspaceSize() <= max_workspace_size) { new_valid_plans.emplace_back(std::move(plan)); } @@ -449,26 +691,45 @@ void generate_and_filter_plans(const cudnnHandle_t handle, cudnn_frontend::Opera } } -auto get_plans_from_find(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, const Tensor& x, const Tensor& y, const Tensor& w, const CacheKeyWrapper& key, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const bool deterministic, const bool allow_tf32) { - auto opGraph = build_opgraph(handle, desc, x, y, w, key, padding, stride, dilation); - void *data_ptrs[] = {x.data_ptr(), y.data_ptr(), w.data_ptr()}; +auto get_plans_from_find( + const cudnnHandle_t handle, + const cudnnBackendDescriptorType_t desc, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const CacheKeyWrapper& key, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + const bool deterministic, + const bool allow_tf32) { + auto opGraph = + build_opgraph(handle, desc, x, y, w, key, padding, stride, dilation); + void* data_ptrs[] = {x.data_ptr(), y.data_ptr(), w.data_ptr()}; int64_t uids[] = {'x', 'y', 'w'}; - // We don't care about getting the best ordering of algos if we're roing to run all of them - auto sources = get_generator_sources(desc, x, deterministic, allow_tf32, CUDNN_HEUR_MODE_INSTANT, true, true); - cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data()); + // We don't care about getting the best ordering of algos if we're going to + // run all of them + auto sources = get_generator_sources( + desc, x, deterministic, allow_tf32, CUDNN_HEUR_MODE_INSTANT, true, true); + cudnn_frontend::EngineConfigGenerator generator( + sources.size(), sources.data()); cudnn_frontend::executionPlans_t valid_plans; c10::DeviceGuard g(x.options().device()); at::DataPtr workspace_ptr; - generate_and_filter_plans(handle, opGraph, generator, x, valid_plans, workspace_ptr); - auto variantPack = cudnn_frontend::VariantPackBuilder() - .setDataPointers(3, data_ptrs) - .setUids(3, uids) - .setWorkspacePointer(workspace_ptr ? workspace_ptr.get() : nullptr) - .build(); + generate_and_filter_plans( + handle, opGraph, generator, x, valid_plans, workspace_ptr); + auto variantPack = + cudnn_frontend::VariantPackBuilder() + .setDataPointers(3, data_ptrs) + .setUids(3, uids) + .setWorkspacePointer(workspace_ptr ? 
workspace_ptr.get() : nullptr) + .build(); auto benchmark_limit = at::globalContext().benchmarkLimitCuDNN(); benchmark_limit = benchmark_limit ? benchmark_limit : 10000; - auto plans = cudnn_frontend::time_sorted_plan(handle, std::move(valid_plans), variantPack, benchmark_limit); + auto plans = cudnn_frontend::time_sorted_plan< + cudnn_frontend::CudnnFindSamplingTechnique::CUDNN_FIND_SAMPLE_ONCE>( + handle, std::move(valid_plans), variantPack, benchmark_limit); cudnn_frontend::executionPlans_t sorted_plans; for (auto& plan : plans) { @@ -477,30 +738,53 @@ auto get_plans_from_find(const cudnnHandle_t handle, const cudnnBackendDescripto return sorted_plans; } -auto get_plans_from_find_fused(const cudnnHandle_t handle, - const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, - const float alpha, const CacheKeyFusedWrapper& key, - const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, - const bool deterministic, const bool allow_tf32) { - auto opGraph = build_opgraph_fused(handle, x, y, w, z, b, alpha, key, padding, stride, dilation); - void *data_ptrs[] = {x.data_ptr(), y.data_ptr(), w.data_ptr(), z.data_ptr(), b.data_ptr()}; +auto get_plans_from_find_fused( + const cudnnHandle_t handle, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const Tensor& z, + const Tensor& b, + const float alpha, + const CacheKeyFusedWrapper& key, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + const bool deterministic, + const bool allow_tf32) { + auto opGraph = build_opgraph_fused( + handle, x, y, w, z, b, alpha, key, padding, stride, dilation); + void* data_ptrs[] = { + x.data_ptr(), y.data_ptr(), w.data_ptr(), z.data_ptr(), b.data_ptr()}; int64_t uids[] = {'x', 'y', 'w', 'z', 'b'}; - auto sources = get_generator_sources(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, x, deterministic, allow_tf32, CUDNN_HEUR_MODE_INSTANT, true, true); - cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data()); + auto sources = get_generator_sources( + CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, + x, + deterministic, + allow_tf32, + CUDNN_HEUR_MODE_INSTANT, + true, + true); + cudnn_frontend::EngineConfigGenerator generator( + sources.size(), sources.data()); cudnn_frontend::executionPlans_t valid_plans; c10::DeviceGuard g(x.options().device()); at::DataPtr workspace_ptr; - generate_and_filter_plans(handle, opGraph, generator, x, valid_plans, workspace_ptr); - auto variantPack = cudnn_frontend::VariantPackBuilder() - .setDataPointers(5, data_ptrs) - .setUids(5, uids) - .setWorkspacePointer(workspace_ptr ? workspace_ptr.get() : nullptr) - .build(); + generate_and_filter_plans( + handle, opGraph, generator, x, valid_plans, workspace_ptr); + auto variantPack = + cudnn_frontend::VariantPackBuilder() + .setDataPointers(5, data_ptrs) + .setUids(5, uids) + .setWorkspacePointer(workspace_ptr ? workspace_ptr.get() : nullptr) + .build(); auto benchmark_limit = at::globalContext().benchmarkLimitCuDNN(); benchmark_limit = benchmark_limit ? 
benchmark_limit : 10000; - auto plans = cudnn_frontend::time_sorted_plan(handle, std::move(valid_plans), variantPack, benchmark_limit); + auto plans = cudnn_frontend::time_sorted_plan< + cudnn_frontend::CudnnFindSamplingTechnique::CUDNN_FIND_SAMPLE_ONCE>( + handle, std::move(valid_plans), variantPack, benchmark_limit); cudnn_frontend::executionPlans_t sorted_plans; for (auto& plan : plans) { @@ -509,203 +793,419 @@ auto get_plans_from_find_fused(const cudnnHandle_t handle, return sorted_plans; } - -// We only get configs from this stage to avoid building unnecessary plans that are never executed -auto get_configs_from_heuristics(const cudnnHandle_t handle, const cudnnBackendDescriptorType_t desc, std::string& opgraph_tag, const Tensor& x, const Tensor& y, const Tensor& w, const CacheKeyWrapper& key, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const bool deterministic, const bool allow_tf32, const bool fallback) { - auto opGraph = build_opgraph(handle, desc, x, y, w, key, padding, stride, dilation); +// We only get configs from this stage to avoid building unnecessary plans that +// are never executed +auto get_configs_from_heuristics( + const cudnnHandle_t handle, + const cudnnBackendDescriptorType_t desc, + std::string& opgraph_tag, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const CacheKeyWrapper& key, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + const bool deterministic, + const bool allow_tf32, + const bool fallback) { + auto opGraph = + build_opgraph(handle, desc, x, y, w, key, padding, stride, dilation); opgraph_tag = opGraph.getTag(); - auto heuristic_mode = at::native::cudnnv8_use_heur_mode_b() ? CUDNN_HEUR_MODE_B : CUDNN_HEUR_MODE_INSTANT; - auto sources = get_generator_sources(desc, x, deterministic, allow_tf32, heuristic_mode, !fallback, fallback); + auto heuristic_mode = at::native::cudnnv8_use_heur_mode_b() + ? CUDNN_HEUR_MODE_B + : CUDNN_HEUR_MODE_INSTANT; + auto sources = get_generator_sources( + desc, x, deterministic, allow_tf32, heuristic_mode, !fallback, fallback); - cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data()); + cudnn_frontend::EngineConfigGenerator generator( + sources.size(), sources.data()); auto configs = generator.generate_engine_config(opGraph); return configs; } -auto get_configs_from_heuristics_fused(const cudnnHandle_t handle, std::string& opgraph_tag, const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, const float alpha, const CacheKeyFusedWrapper& key, const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const bool deterministic, const bool allow_tf32, const bool fallback) { - auto opGraph = build_opgraph_fused(handle, x, y, w, z, b, alpha, key, padding, stride, dilation); +auto get_configs_from_heuristics_fused( + const cudnnHandle_t handle, + std::string& opgraph_tag, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const Tensor& z, + const Tensor& b, + const float alpha, + const CacheKeyFusedWrapper& key, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + const bool deterministic, + const bool allow_tf32, + const bool fallback) { + auto opGraph = build_opgraph_fused( + handle, x, y, w, z, b, alpha, key, padding, stride, dilation); opgraph_tag = opGraph.getTag(); - auto heuristic_mode = at::native::cudnnv8_use_heur_mode_b() ? 
CUDNN_HEUR_MODE_B : CUDNN_HEUR_MODE_INSTANT; - auto sources = get_generator_sources(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, x, deterministic, allow_tf32, heuristic_mode, !fallback, fallback); + auto heuristic_mode = at::native::cudnnv8_use_heur_mode_b() + ? CUDNN_HEUR_MODE_B + : CUDNN_HEUR_MODE_INSTANT; + auto sources = get_generator_sources( + CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, + x, + deterministic, + allow_tf32, + heuristic_mode, + !fallback, + fallback); - cudnn_frontend::EngineConfigGenerator generator(sources.size(), sources.data()); + cudnn_frontend::EngineConfigGenerator generator( + sources.size(), sources.data()); auto configs = generator.generate_engine_config(opGraph); return configs; } -void try_plans(cudnn_frontend::executionPlans_t& plans, const CacheKeyWrapper& key, const cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w) { - for (auto & plan : plans) { +void try_plans( + cudnn_frontend::executionPlans_t& plans, + const CacheKeyWrapper& key, + const cudnnHandle_t handle, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const cudnnBackendDescriptorType_t operation) { + for (auto& plan : plans) { try { - run_conv_plan(handle, x, y, w, plan); + run_conv_plan(handle, x, y, w, plan, operation); benchmark_cache.update(key, plan); return; - } catch (cudnn_frontend::cudnnException &e) {} catch (CuDNNError &e) {} - catch (c10::OutOfMemoryError &e) { - (void)cudaGetLastError(); // clear CUDA error + } catch (cudnn_frontend::cudnnException& e) { + } catch (CuDNNError& e) { + } catch (c10::OutOfMemoryError& e) { + (void)cudaGetLastError(); // clear CUDA error } } - TORCH_CHECK(false, "FIND was unable to find an engine to execute this computation"); + TORCH_CHECK( + false, "FIND was unable to find an engine to execute this computation"); } -void try_plans_fused(cudnn_frontend::executionPlans_t& plans, const CacheKeyFusedWrapper& key, const cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b) { - for (auto & plan : plans) { +void try_plans_fused( + cudnn_frontend::executionPlans_t& plans, + const CacheKeyFusedWrapper& key, + const cudnnHandle_t handle, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const Tensor& z, + const Tensor& b) { + for (auto& plan : plans) { try { run_conv_plan_fused(handle, x, y, w, z, b, plan); benchmark_cache_fused.update(key, plan); return; - } catch (cudnn_frontend::cudnnException &e) {} catch (CuDNNError &e) {} - catch (c10::OutOfMemoryError &e) { - (void)cudaGetLastError(); // clear CUDA error + } catch (cudnn_frontend::cudnnException& e) { + } catch (CuDNNError& e) { + } catch (c10::OutOfMemoryError& e) { + (void)cudaGetLastError(); // clear CUDA error } } - TORCH_CHECK(false, "FIND was unable to find an engine to execute this computation"); + TORCH_CHECK( + false, "FIND was unable to find an engine to execute this computation"); } -bool try_configs(cudnn_frontend::EngineConfigList& configs, const std::string& opgraph_tag, const CacheKeyWrapper& key, const cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w) { - for (auto & config : configs) { +bool try_configs( + cudnn_frontend::EngineConfigList& configs, + const std::string& opgraph_tag, + const CacheKeyWrapper& key, + const cudnnHandle_t handle, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const cudnnBackendDescriptorType_t operation) { + for (auto& config : configs) { try { auto plan = cudnn_frontend::ExecutionPlanBuilder() - 
.setHandle(handle) - .setEngineConfig(config, opgraph_tag) - .build(); + .setHandle(handle) + .setEngineConfig(config, opgraph_tag) + .build(); if (plan_errata_exception(handle, plan.getTag())) { continue; } - run_conv_plan(handle, x, y, w, plan); + run_conv_plan(handle, x, y, w, plan, operation); benchmark_cache.update(key, plan); return true; - } catch (cudnn_frontend::cudnnException &e) {} catch(CuDNNError &e) {} - catch (c10::OutOfMemoryError &e) { - (void)cudaGetLastError(); // clear CUDA error + } catch (cudnn_frontend::cudnnException& e) { + } catch (CuDNNError& e) { + } catch (c10::OutOfMemoryError& e) { + (void)cudaGetLastError(); // clear CUDA error } } return false; } -bool try_configs_fused(cudnn_frontend::EngineConfigList& configs, const std::string& opgraph_tag, const CacheKeyFusedWrapper& key, const cudnnHandle_t handle, const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b) { - for (auto & config : configs) { +bool try_configs_fused( + cudnn_frontend::EngineConfigList& configs, + const std::string& opgraph_tag, + const CacheKeyFusedWrapper& key, + const cudnnHandle_t handle, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const Tensor& z, + const Tensor& b) { + for (auto& config : configs) { try { auto plan = cudnn_frontend::ExecutionPlanBuilder() - .setHandle(handle) - .setEngineConfig(config, opgraph_tag) - .build(); + .setHandle(handle) + .setEngineConfig(config, opgraph_tag) + .build(); if (plan_errata_exception(handle, plan.getTag())) { continue; } run_conv_plan_fused(handle, x, y, w, z, b, plan); benchmark_cache_fused.update(key, plan); return true; - } catch (cudnn_frontend::cudnnException &e) {} catch(CuDNNError &e) {} - catch (c10::OutOfMemoryError &e) { - (void)cudaGetLastError(); // clear CUDA error + } catch (cudnn_frontend::cudnnException& e) { + } catch (CuDNNError& e) { + } catch (c10::OutOfMemoryError& e) { + (void)cudaGetLastError(); // clear CUDA error } } return false; } -void run_single_conv(const cudnnBackendDescriptorType_t operation, - const Tensor& x, const Tensor& y, const Tensor& w, - const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const int64_t groups, - const bool benchmark, const bool deterministic, const bool allow_tf32) { +void run_single_conv( + const cudnnBackendDescriptorType_t operation, + const Tensor& x, + const Tensor& y, + const Tensor& w, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + const int64_t groups, + const bool benchmark, + const bool deterministic, + const bool allow_tf32) { cudnnHandle_t handle = getCudnnHandle(); - CacheKeyWrapper key(operation, y, x, w, padding, stride, dilation, groups, deterministic, allow_tf32); + CacheKeyWrapper key( + operation, + y, + x, + w, + padding, + stride, + dilation, + groups, + deterministic, + allow_tf32); // TODO: is this thread safe if cache is updated? is pointer stale? 
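  // The cache is thread_local (see BenchmarkCache above), so the pointer
  // returned by find() can only be invalidated by this same thread evicting
  // the entry. A rough sketch of the lookup/update protocol used below,
  // assuming the thread-local benchmark_cache declared earlier and a
  // hypothetical chosen_plan picked by heuristics or FIND:
  //
  //   if (auto* cached = benchmark_cache.find(key)) {
  //     run_conv_plan(handle, x, y, w, *cached, operation);  // fast path
  //   } else {
  //     // build/select chosen_plan via heuristics or FIND, run it, then
  //     benchmark_cache.update(key, chosen_plan);  // may evict the LRU entry
  //   }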
auto search = benchmark_cache.find(key); if (search) { try { - run_conv_plan(handle, x, y, w, *search); + run_conv_plan(handle, x, y, w, *search, operation); return; - } catch(c10::OutOfMemoryError &e) { + } catch (c10::OutOfMemoryError& e) { (void)cudaGetLastError(); // clear CUDA error } } if (!benchmark) { std::string opgraph_tag; // extra data needed for errata filter // heuristic configs - cudnn_frontend::EngineConfigList configs = get_configs_from_heuristics(handle, operation, - opgraph_tag, - x, y, w, key, - padding, stride, dilation, - deterministic, allow_tf32, false); - if (try_configs(configs, opgraph_tag, key, handle, x, y, w)) { return; } + cudnn_frontend::EngineConfigList configs = get_configs_from_heuristics( + handle, + operation, + opgraph_tag, + x, + y, + w, + key, + padding, + stride, + dilation, + deterministic, + allow_tf32, + false); + if (try_configs(configs, opgraph_tag, key, handle, x, y, w, operation)) { + return; + } // fallback configs - configs = get_configs_from_heuristics(handle, operation, - opgraph_tag, - x, y, w, key, - padding, stride, dilation, - deterministic, allow_tf32, true); - if (try_configs(configs, opgraph_tag, key, handle, x, y, w)) { return; } - TORCH_CHECK(false, "GET was unable to find an engine to execute this computation"); + configs = get_configs_from_heuristics( + handle, + operation, + opgraph_tag, + x, + y, + w, + key, + padding, + stride, + dilation, + deterministic, + allow_tf32, + true); + if (try_configs(configs, opgraph_tag, key, handle, x, y, w, operation)) { + return; + } + TORCH_CHECK( + false, "GET was unable to find an engine to execute this computation"); } else { - cudnn_frontend::executionPlans_t plans = get_plans_from_find(handle, operation, - x, y, w, key, - padding, stride, dilation, - deterministic, allow_tf32); + cudnn_frontend::executionPlans_t plans = get_plans_from_find( + handle, + operation, + x, + y, + w, + key, + padding, + stride, + dilation, + deterministic, + allow_tf32); // Replicate v7 behavior: clear cached blocks as benchmark incurs // significant memory consumptiont that is not needed after this step if (at::native::_cudnn_get_conv_benchmark_empty_cache()) { c10::cuda::CUDACachingAllocator::emptyCache(); } - try_plans(plans, key, handle, x, y, w); + try_plans(plans, key, handle, x, y, w, operation); } } -void run_fused_conv(const Tensor& x, const Tensor& y, const Tensor& w, const Tensor& z, const Tensor& b, - float alpha, IntArrayRef stride, IntArrayRef padding, IntArrayRef dilation, - int64_t groups, const bool benchmark, const bool deterministic, const bool allow_tf32) { +void run_fused_conv( + const Tensor& x, + const Tensor& y, + const Tensor& w, + const Tensor& z, + const Tensor& b, + float alpha, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + const bool benchmark, + const bool deterministic, + const bool allow_tf32) { cudnnHandle_t handle = getCudnnHandle(); - CacheKeyFusedWrapper key(y, x, w, z, b, alpha, padding, stride, dilation, groups, deterministic, allow_tf32); + CacheKeyFusedWrapper key( + y, + x, + w, + z, + b, + alpha, + padding, + stride, + dilation, + groups, + deterministic, + allow_tf32); auto search = benchmark_cache_fused.find(key); if (search) { try { run_conv_plan_fused(handle, x, y, w, z, b, *search); return; - } catch(c10::OutOfMemoryError &e) { + } catch (c10::OutOfMemoryError& e) { (void)cudaGetLastError(); // clear CUDA error } } if (!benchmark) { std::string opgraph_tag; // extra data needed for errata filter // heuristic configs - 
cudnn_frontend::EngineConfigList configs = get_configs_from_heuristics_fused(handle, - opgraph_tag, - x, y, w, z, b, alpha, key, - padding, stride, dilation, - deterministic, allow_tf32, false); - if (try_configs_fused(configs, opgraph_tag, key, handle, x, y, w, z, b)) { return; } + cudnn_frontend::EngineConfigList configs = + get_configs_from_heuristics_fused( + handle, + opgraph_tag, + x, + y, + w, + z, + b, + alpha, + key, + padding, + stride, + dilation, + deterministic, + allow_tf32, + false); + if (try_configs_fused(configs, opgraph_tag, key, handle, x, y, w, z, b)) { + return; + } // fallback configs - configs = get_configs_from_heuristics_fused(handle, - opgraph_tag, - x, y, w, z, b, alpha, key, - padding, stride, dilation, - deterministic, allow_tf32, true); - if (try_configs_fused(configs, opgraph_tag, key, handle, x, y, w, z, b)) { return; } - TORCH_CHECK(false, "GET was unable to find an engine to execute this computation"); + configs = get_configs_from_heuristics_fused( + handle, + opgraph_tag, + x, + y, + w, + z, + b, + alpha, + key, + padding, + stride, + dilation, + deterministic, + allow_tf32, + true); + if (try_configs_fused(configs, opgraph_tag, key, handle, x, y, w, z, b)) { + return; + } + TORCH_CHECK( + false, "GET was unable to find an engine to execute this computation"); } else { - cudnn_frontend::executionPlans_t plans = get_plans_from_find_fused(handle, - x, y, w, z, b, alpha, key, - padding, stride, dilation, - deterministic, allow_tf32); + cudnn_frontend::executionPlans_t plans = get_plans_from_find_fused( + handle, + x, + y, + w, + z, + b, + alpha, + key, + padding, + stride, + dilation, + deterministic, + allow_tf32); try_plans_fused(plans, key, handle, x, y, w, z, b); } } void raw_cudnn_convolution_forward_out( - const Tensor& output, const Tensor& input, const Tensor& weight, - const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const int64_t groups, - const bool benchmark, const bool deterministic, const bool allow_tf32) -{ - if (output.numel() == 0) { return; } + const Tensor& output, + const Tensor& input, + const Tensor& weight, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + const int64_t groups, + const bool benchmark, + const bool deterministic, + const bool allow_tf32) { + if (output.numel() == 0) { + return; + } if (at::native::cudnnv8_enabled_check_debug()) { - run_single_conv(CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, - input, output, weight, padding, stride, dilation, groups, - benchmark, deterministic, allow_tf32); + run_single_conv( + CUDNN_BACKEND_OPERATION_CONVOLUTION_FORWARD_DESCRIPTOR, + input, + output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } else { raw_cudnn_convolution_forward_out_v7( - output, input, weight, - padding, stride, dilation, groups, - benchmark, deterministic, allow_tf32); + output, + input, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } } @@ -713,37 +1213,83 @@ void raw_cudnn_convolution_backward_input_out( const at::Tensor& grad_input, const at::Tensor& grad_output, const at::Tensor& weight, - const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const int64_t groups, - const bool benchmark, const bool deterministic, const bool allow_tf32) { - if (grad_input.numel() == 0) { return; } + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + const int64_t groups, + const 
bool benchmark, + const bool deterministic, + const bool allow_tf32) { + if (grad_input.numel() == 0) { + return; + } if (at::native::cudnnv8_enabled_check_debug()) { - run_single_conv(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR, - grad_input, grad_output, weight, padding, stride, dilation, groups, - benchmark, deterministic, allow_tf32); + run_single_conv( + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_DATA_DESCRIPTOR, + grad_input, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } else { raw_cudnn_convolution_backward_input_out_v7( - grad_input, - grad_output, - weight, - padding, stride, dilation, groups, - benchmark, deterministic, allow_tf32); + grad_input, + grad_output, + weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } } void raw_cudnn_convolution_backward_weight_out( - const Tensor& grad_weight, const Tensor& grad_output, const Tensor& input, - const IntArrayRef padding, const IntArrayRef stride, const IntArrayRef dilation, const int64_t groups, - const bool benchmark, const bool deterministic, const bool allow_tf32) { - if (grad_weight.numel() == 0) { return; } + const Tensor& grad_weight, + const Tensor& grad_output, + const Tensor& input, + const IntArrayRef padding, + const IntArrayRef stride, + const IntArrayRef dilation, + const int64_t groups, + const bool benchmark, + const bool deterministic, + const bool allow_tf32) { + if (grad_weight.numel() == 0) { + return; + } if (at::native::cudnnv8_enabled_check_debug()) { - run_single_conv(CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR, - input, grad_output, grad_weight, padding, stride, dilation, groups, - benchmark, deterministic, allow_tf32); + run_single_conv( + CUDNN_BACKEND_OPERATION_CONVOLUTION_BACKWARD_FILTER_DESCRIPTOR, + input, + grad_output, + grad_weight, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } else { raw_cudnn_convolution_backward_weight_out_v7( - grad_weight, grad_output, input, - padding, stride, dilation, groups, - benchmark, deterministic, allow_tf32); + grad_weight, + grad_output, + input, + padding, + stride, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } } @@ -761,19 +1307,46 @@ void raw_cudnn_convolution_add_relu_out( bool benchmark, bool deterministic, bool allow_tf32) { - if (output.numel() == 0) { return; } + if (output.numel() == 0) { + return; + } if (at::native::cudnnv8_enabled_check_debug()) { - auto bias_ = input.ndimension() == 4 ? bias.view({1, bias.numel(), 1, 1}) : bias.view({1, bias.numel(), 1, 1, 1}); - run_fused_conv(input, output, weight, z, bias_, - alpha, stride, padding, dilation, - groups, benchmark, deterministic, allow_tf32); + auto bias_ = input.ndimension() == 4 + ? 
bias.view({1, bias.numel(), 1, 1}) + : bias.view({1, bias.numel(), 1, 1, 1}); + run_fused_conv( + input, + output, + weight, + z, + bias_, + alpha, + stride, + padding, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } else { - raw_cudnn_convolution_add_relu_out_v7(output, input, weight, z, - alpha, bias, stride, padding, dilation, - groups, benchmark, deterministic, allow_tf32); + raw_cudnn_convolution_add_relu_out_v7( + output, + input, + weight, + z, + alpha, + bias, + stride, + padding, + dilation, + groups, + benchmark, + deterministic, + allow_tf32); } } -}} // at::native +} // namespace native +} // namespace at -#endif // AT_CUDNN_ENABLED +#endif // AT_CUDNN_ENABLED diff --git a/aten/src/ATen/native/cudnn/GridSampler.cpp b/aten/src/ATen/native/cudnn/GridSampler.cpp index 8697b89c399af..af6b13567e37c 100644 --- a/aten/src/ATen/native/cudnn/GridSampler.cpp +++ b/aten/src/ATen/native/cudnn/GridSampler.cpp @@ -1,6 +1,6 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include #include +#include #include #include @@ -8,58 +8,61 @@ #include #include #else -#include -#include #include +#include +#include #endif #if !AT_CUDNN_ENABLED() -namespace at { namespace native { +namespace at { +namespace native { // See Note [ATen preprocessor philosophy] -Tensor cudnn_grid_sampler_forward( - const Tensor& input_t, const Tensor& grid_t) { +Tensor cudnn_grid_sampler_forward(const Tensor& input_t, const Tensor& grid_t) { AT_ERROR("cudnn_grid_sampler_forward: ATen not compiled with cuDNN support"); } std::tuple cudnn_grid_sampler_backward( - const Tensor& input_t, const Tensor& grid_t, + const Tensor& input_t, + const Tensor& grid_t, const Tensor& grad_output_t) { AT_ERROR("cudnn_grid_sampler_backward: ATen not compiled with cuDNN support"); } -}} +} // namespace native +} // namespace at #else // AT_CUDNN_ENABLED +#include #include #include #include -#include #include #include // TODO: descriptor checking - -namespace at { namespace native { +namespace at { +namespace native { namespace { -void setSamplerDescriptor(SpatialTransformerDescriptor& desc, cudnnDataType_t dataType, const at::Tensor& tensor) -{ +void setSamplerDescriptor( + SpatialTransformerDescriptor& desc, + cudnnDataType_t dataType, + const at::Tensor& tensor) { int inputSize[4] = {0}; for (const auto i : c10::irange(tensor.dim())) { - inputSize[i] = (int) tensor.size(i); + inputSize[i] = (int)tensor.size(i); } desc.set(dataType, 4, inputSize); } -void checkGridSize(CheckedFrom c, TensorArg grid, TensorArg input) -{ +void checkGridSize(CheckedFrom c, TensorArg grid, TensorArg input) { // assert size of grid is n*h*w*2 // FYI: grid is between [-1, 1], where -1 left most pixel, // 1 represents right most pixel (and hence 0 is the center pixel) @@ -72,22 +75,19 @@ void checkGridSize(CheckedFrom c, TensorArg grid, TensorArg input) checkSize(c, grid, 3, 2); } -} // namespace +} // namespace -Tensor cudnn_grid_sampler_forward( - const Tensor& input_t, const Tensor& grid_t) -{ +Tensor cudnn_grid_sampler_forward(const Tensor& input_t, const Tensor& grid_t) { // See NOTE [ grid_sampler Native Functions ]. // Add checks here in case this is called instead of grid_sampler. 
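  // Shape contract enforced by the checks below (see checkGridSize/checkDim):
  // input is (N, C, H_in, W_in), grid is (N, H_out, W_out, 2) with values in
  // [-1, 1], and the output allocated further down is (N, C, H_out, W_out).
  // Illustrative call, assuming 4-D CUDA tensors of the same scalar type:
  //
  //   auto out = at::native::cudnn_grid_sampler_forward(input, grid);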
check_grid_sampler_common(input_t, grid_t); TORCH_CHECK( - cond_cudnn_grid_sampler(input_t, grid_t), - "Invalid arguments to cudnn_grid_sampler_forward"); + cond_cudnn_grid_sampler(input_t, grid_t), + "Invalid arguments to cudnn_grid_sampler_forward"); auto input_contig = contiguousIfZeroInStrides(input_t); auto grid_contig = grid_t.contiguous(); - TensorArg input{ input_contig, "input", 1 }, - grid{ grid_contig, "grid", 2 }; + TensorArg input{input_contig, "input", 1}, grid{grid_contig, "grid", 2}; CheckedFrom c = "cudnn_grid_sampler_forward"; checkAllSameGPU(c, {input, grid}); checkAllSameType(c, {input, grid}); @@ -95,10 +95,11 @@ Tensor cudnn_grid_sampler_forward( checkDim(c, input, 4); auto output_t = at::empty({0}, input->options()); - output_t.resize_({input->size(0), input->size(1), grid->size(1), grid->size(2)}); + output_t.resize_( + {input->size(0), input->size(1), grid->size(1), grid->size(2)}); - TensorDescriptor idesc{ *input }; // input descriptor - TensorDescriptor odesc{ output_t }; // output descriptor + TensorDescriptor idesc{*input}; // input descriptor + TensorDescriptor odesc{output_t}; // output descriptor SpatialTransformerDescriptor desc; // sampler descriptor auto handle = getCudnnHandle(); @@ -108,11 +109,15 @@ Tensor cudnn_grid_sampler_forward( Constant one(dataType, 1); Constant zero(dataType, 0); AT_CUDNN_CHECK(cudnnSpatialTfSamplerForward( - handle, desc.desc(), - &one, idesc.desc(), input->data_ptr(), - grid->data_ptr(), - &zero, odesc.desc(), output_t.data_ptr() - )); + handle, + desc.desc(), + &one, + idesc.desc(), + input->const_data_ptr(), + grid->const_data_ptr(), + &zero, + odesc.desc(), + output_t.data_ptr())); return output_t; } @@ -120,22 +125,21 @@ Tensor cudnn_grid_sampler_forward( // NB: CuDNN does not support output mask; you always get both // gradients. std::tuple cudnn_grid_sampler_backward( - const Tensor& input_t, const Tensor& grid_t, - const Tensor& grad_output_t) -{ + const Tensor& input_t, + const Tensor& grid_t, + const Tensor& grad_output_t) { // See NOTE [ grid_sampler Native Functions ]. // Add checks here in case this is called instead of grid_sampler. 
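  // As noted above, cuDNN always computes both gradients (there is no
  // output-mask variant). Illustrative usage, assuming grad_output has the
  // forward output's shape (N, C, H_out, W_out):
  //
  //   auto [grad_input, grad_grid] =
  //       at::native::cudnn_grid_sampler_backward(input, grid, grad_output);
  //
  // grad_input matches input's sizes and grad_grid matches grid's sizes (see
  // the resize_ calls below).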
check_grid_sampler_common(input_t, grid_t); TORCH_CHECK( - cond_cudnn_grid_sampler(input_t, grid_t), - "Invalid arguments to cudnn_grid_sampler_backward"); + cond_cudnn_grid_sampler(input_t, grid_t), + "Invalid arguments to cudnn_grid_sampler_backward"); auto input_contig = contiguousIfZeroInStrides(input_t); auto grid_contig = grid_t.contiguous(); auto grad_output_contig = contiguousIfZeroInStrides(grad_output_t); - TensorArg input{ input_contig, "input", 1 }, - grid{ grid_contig, "grid", 2 }, - grad_output{ grad_output_contig, "grad_output", 3 }; + TensorArg input{input_contig, "input", 1}, grid{grid_contig, "grid", 2}, + grad_output{grad_output_contig, "grad_output", 3}; CheckedFrom c = "cudnn_grid_sampler_backward"; checkAllSameGPU(c, {input, grad_output, grid}); checkGridSize(c, grid, input); @@ -147,9 +151,9 @@ std::tuple cudnn_grid_sampler_backward( auto grad_grid_t = at::empty({0}, grid->options()); grad_grid_t.resize_(grid->sizes()); - TensorDescriptor idesc{ *input }; // input descriptor - TensorDescriptor odesc{ *grad_output }; // grad_output descriptor - TensorDescriptor gdesc{ grad_input_t }; // grad_input descriptor + TensorDescriptor idesc{*input}; // input descriptor + TensorDescriptor odesc{*grad_output}; // grad_output descriptor + TensorDescriptor gdesc{grad_input_t}; // grad_input descriptor SpatialTransformerDescriptor desc; // sampler descriptor auto handle = getCudnnHandle(); @@ -159,18 +163,26 @@ std::tuple cudnn_grid_sampler_backward( Constant one(dataType, 1); Constant zero(dataType, 0); AT_CUDNN_CHECK(cudnnSpatialTfSamplerBackward( - handle, desc.desc(), - &one, idesc.desc(), input->data_ptr(), - &zero, gdesc.desc(), grad_input_t.data_ptr(), - &one, odesc.desc(), grad_output->data_ptr(), - // intruigingly, the outputs don't need descriptors - grid->data_ptr(), - &zero, grad_grid_t.data_ptr() - )); - - return std::tuple{ grad_input_t, grad_grid_t }; + handle, + desc.desc(), + &one, + idesc.desc(), + input->const_data_ptr(), + &zero, + gdesc.desc(), + grad_input_t.data_ptr(), + &one, + odesc.desc(), + grad_output->const_data_ptr(), + // intriguingly, the outputs don't need descriptors + grid->const_data_ptr(), + &zero, + grad_grid_t.data_ptr())); + + return std::tuple{grad_input_t, grad_grid_t}; } -}} // namespace at::cudnn +} // namespace native +} // namespace at #endif diff --git a/aten/src/ATen/native/cudnn/LossCTC.cpp b/aten/src/ATen/native/cudnn/LossCTC.cpp index cb08b57c309c1..dff3bf9b80141 100644 --- a/aten/src/ATen/native/cudnn/LossCTC.cpp +++ b/aten/src/ATen/native/cudnn/LossCTC.cpp @@ -1,9 +1,9 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include #include +#include #include #if AT_CUDNN_ENABLED() - #include +#include #endif #ifndef AT_PER_OPERATOR_HEADERS @@ -20,7 +20,8 @@ #if (!AT_CUDNN_ENABLED()) -namespace at { namespace native { +namespace at { +namespace native { // See Note [ATen preprocessor philosophy] @@ -42,7 +43,14 @@ bool _use_cudnn_ctc_loss_tensor( return false; } -std::tuple _cudnn_ctc_loss(const Tensor& log_probs, const Tensor& targets, IntArrayRef input_lengths, IntArrayRef target_lengths, int64_t BLANK, bool deterministic, bool zero_infinity) { +std::tuple _cudnn_ctc_loss( + const Tensor& log_probs, + const Tensor& targets, + IntArrayRef input_lengths, + IntArrayRef target_lengths, + int64_t BLANK, + bool deterministic, + bool zero_infinity) { AT_ERROR("cudnn_ctc_loss: ATen not compiled with cuDNN >= 7 support"); } @@ -57,7 +65,8 @@ std::tuple _cudnn_ctc_loss_tensor( AT_ERROR("cudnn_ctc_loss: ATen not compiled with cuDNN >= 7 
support"); } -}} +} // namespace native +} // namespace at #else // AT_CUDNN_ENABLED @@ -68,7 +77,8 @@ std::tuple _cudnn_ctc_loss_tensor( #include #include -namespace at { namespace native { +namespace at { +namespace native { bool _use_cudnn_ctc_loss( const Tensor& log_probs, @@ -82,8 +92,7 @@ bool _use_cudnn_ctc_loss( (targets.dim() == 1) && (log_probs.scalar_type() == at::kFloat) && (targets.scalar_type() == at::kInt) && (log_probs.device().type() == at::kCUDA) && - (targets.device().type() == at::kCPU) && - (targets.is_contiguous()) && + (targets.device().type() == at::kCPU) && (targets.is_contiguous()) && (log_probs.dim() == 3); if (use_cudnn) { @@ -96,8 +105,8 @@ bool _use_cudnn_ctc_loss( for (const auto b : c10::irange(target_lengths.size())) { // target length < 256 is documented, but we see illegal memory accesses // when target lengths > input lengths for CuDNN - use_cudnn = - use_cudnn && (target_lengths[b] < 256) && (target_lengths[b] <= input_lengths[b]); + use_cudnn = use_cudnn && (target_lengths[b] < 256) && + (target_lengths[b] <= input_lengths[b]); } } return use_cudnn; @@ -113,15 +122,21 @@ bool _use_cudnn_ctc_loss_tensor( Tensor tlc = target_lengths.to(Device(at::kCPU), at::kLong).contiguous(); IntArrayRef il(ilc.data_ptr(), ilc.numel()); IntArrayRef tl(tlc.data_ptr(), tlc.numel()); - return at::_use_cudnn_ctc_loss( - log_probs, targets, il, tl, BLANK); + return at::_use_cudnn_ctc_loss(log_probs, targets, il, tl, BLANK); } -std::tuple _cudnn_ctc_loss(const Tensor& log_probs_t, const Tensor& targets_t, IntArrayRef input_lengths_, IntArrayRef target_lengths_, int64_t BLANK, bool deterministic, bool zero_infinity) { +std::tuple _cudnn_ctc_loss( + const Tensor& log_probs_t, + const Tensor& targets_t, + IntArrayRef input_lengths_, + IntArrayRef target_lengths_, + int64_t BLANK, + bool deterministic, + bool zero_infinity) { (void)zero_infinity; // only used for backward const CheckedFrom c = "cudnn_ctc_loss"; - const TensorArg log_probs { log_probs_t, "log_probs", 1 }; - const TensorArg targets { targets_t, "targets", 2 }; + const TensorArg log_probs{log_probs_t, "log_probs", 1}; + const TensorArg targets{targets_t, "targets", 2}; checkDim(c, log_probs, 3); checkScalarType(c, log_probs, kFloat); checkDim(c, targets, 1); @@ -130,11 +145,16 @@ std::tuple _cudnn_ctc_loss(const Tensor& log_probs_t, const Tens checkBackend(c, {*log_probs}, Backend::CUDA); checkBackend(c, {*targets}, Backend::CPU); const auto batch_size = log_probs->size(1); - TORCH_CHECK(static_cast(input_lengths_.size()) == batch_size, "input_lengths needs to have size to match batch_size"); - TORCH_CHECK(static_cast(target_lengths_.size()) == batch_size, "target_lengths needs to have size to match batch_size"); + TORCH_CHECK( + static_cast(input_lengths_.size()) == batch_size, + "input_lengths needs to have size to match batch_size"); + TORCH_CHECK( + static_cast(target_lengths_.size()) == batch_size, + "target_lengths needs to have size to match batch_size"); std::vector input_lengths(input_lengths_.begin(), input_lengths_.end()); - std::vector target_lengths(target_lengths_.begin(), target_lengths_.end()); + std::vector target_lengths( + target_lengths_.begin(), target_lengths_.end()); TORCH_CHECK(BLANK == 0, "blank must be label 0 for cudnn_ctc_loss"); // checked in dispatch: @@ -143,7 +163,9 @@ std::tuple _cudnn_ctc_loss(const Tensor& log_probs_t, const Tens const auto handle = getCudnnHandle(); - const cudnnCTCLossAlgo_t algo = (deterministic ? 
CUDNN_CTC_LOSS_ALGO_DETERMINISTIC : CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC); + const cudnnCTCLossAlgo_t algo = + (deterministic ? CUDNN_CTC_LOSS_ALGO_DETERMINISTIC + : CUDNN_CTC_LOSS_ALGO_NON_DETERMINISTIC); CTCLossDescriptor ctc_loss_desc; @@ -167,7 +189,8 @@ std::tuple _cudnn_ctc_loss(const Tensor& log_probs_t, const Tens ctc_loss_desc.desc(), &workspace_size)); - Tensor workspace = at::empty(workspace_size, log_probs->options().dtype(kByte)); + Tensor workspace = + at::empty(workspace_size, log_probs->options().dtype(kByte)); Tensor costs = at::empty({log_probs->size(1)}, log_probs->options()); AT_CUDNN_CHECK(cudnnCTCLoss( @@ -203,6 +226,7 @@ std::tuple _cudnn_ctc_loss_tensor( log_probs, targets, il, tl, BLANK, deterministic, zero_infinity); } -}} // namespace at::native +} // namespace native +} // namespace at #endif diff --git a/aten/src/ATen/native/cudnn/MHA.cpp b/aten/src/ATen/native/cudnn/MHA.cpp new file mode 100644 index 0000000000000..1f6bdbf5305a2 --- /dev/null +++ b/aten/src/ATen/native/cudnn/MHA.cpp @@ -0,0 +1,681 @@ +#include +#include +#include + +#if defined(USE_ROCM) || !AT_CUDNN_ENABLED() || \ + (defined(CUDNN_VERSION) && CUDNN_VERSION < 8900) + +namespace at { +namespace native { + +void run_cudnn_SDP_fprop( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + float scaling_factor, + bool isTraining, + bool is_causal, + double dropout_probability, + const Tensor& q, + const Tensor& k, + const Tensor& v, + Tensor& softmaxstats, + Tensor& o, + Tensor& dropoutseed, + Tensor& dropoutoffset) { + TORCH_CHECK( + false, "PyTorch was not compiled with cuDNN Flash Attention enabled!"); +} + +void run_cudnn_SDP_bprop( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + float scaling_factor, + bool is_causal, + float dropout_probability, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const Tensor& o, + const Tensor& dO, + const Tensor& softmaxstats, + Tensor& dQ, + Tensor& dK, + Tensor& dV, + const Tensor& dropoutseed, + const Tensor& dropoutoffset) { + TORCH_CHECK( + false, "PyTorch was not compiled with cuDNN Flash Attention enabled!"); +} + +} // namespace native +} // namespace at + +#else // AT_CUDNN_ENABLED && defined(CUDNN_VERSION) && CUDNN_VERSION >= 8900 +#include +#include +#include +#include + +#include +#include + +#include +#include + +#include +#include + +#include + +namespace at { +namespace native { + +#include + +namespace fe = cudnn_frontend; +using graph_and_tensors = std::tuple< + std::shared_ptr, + std::shared_ptr, // Q, + std::shared_ptr, // K, + std::shared_ptr, // V, + std::shared_ptr, // Attn_scale, + // TODO(eqy): additional options + // std::shared_ptr, // Bias, + // std::shared_ptr, // SEQ_LEN_Q, + // std::shared_ptr, // SEQ_LEN_KV, + std::shared_ptr, // Seed, + std::shared_ptr, // Offset, + // std::shared_ptr, // Dropout_mask, + // std::shared_ptr, // Dropout_scale + std::shared_ptr, // O + std::shared_ptr // Stats + >; + +using graph_and_tensors_backward = std::tuple< + std::shared_ptr, + std::shared_ptr, // Q, + std::shared_ptr, // K, + std::shared_ptr, // V, + std::shared_ptr, // Attn_scale + std::shared_ptr, // Seed, + std::shared_ptr, // Offset, + std::shared_ptr, // O, + std::shared_ptr, // dO, + std::shared_ptr, // stats, + std::shared_ptr, // dQ, + std::shared_ptr, // dK,, + std::shared_ptr // dV, + >; + +#define MAX_MHA_DIM 4 + +struct MHAParams { + c10::DeviceIndex device_id; + fe::DataType_t dataType; + std::array q_dim; + std::array k_dim; + std::array v_dim; + std::array q_stride; + 
std::array k_stride; + std::array v_stride; + int64_t b; + int64_t h; + int64_t s_q; + int64_t s_kv; + int64_t d; + double dropout_probability; + bool is_causal; + bool return_softmaxstats; +}; + +void setMHAParams( + MHAParams& params, + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + const Tensor& q, + const Tensor& k, + const Tensor& v, + double dropout_probability, + bool is_causal, + bool return_softmaxstats) { + memset(¶ms, 0, sizeof(MHAParams)); + params.device_id = at::cuda::current_device(); + params.dataType = fe::DataType_t::HALF; + if (q.scalar_type() == kBFloat16) { + params.dataType = fe::DataType_t::BFLOAT16; + } + params.b = b; + params.h = h; + params.d = d; + params.s_q = s_q; + params.s_kv = s_kv; + params.dropout_probability = dropout_probability; + params.is_causal = is_causal; + params.return_softmaxstats = return_softmaxstats; + TORCH_INTERNAL_ASSERT( + q.sizes().size() == MAX_MHA_DIM, + "Q tensor has unexpected number of dims, please report a bug to PyTorch."); + TORCH_INTERNAL_ASSERT( + q.strides().size() == MAX_MHA_DIM, + "Q tensor has unexpected number of dims, please report a bug to PyTorch."); + TORCH_INTERNAL_ASSERT( + k.sizes().size() == MAX_MHA_DIM, + "K tensor has unexpected number of dims, please report a bug to PyTorch."); + TORCH_INTERNAL_ASSERT( + k.strides().size() == MAX_MHA_DIM, + "K tensor has unexpected number of dims, please report a bug to PyTorch."); + TORCH_INTERNAL_ASSERT( + v.sizes().size() == MAX_MHA_DIM, + "V tensor has unexpected number of dims, please report a bug to PyTorch."); + TORCH_INTERNAL_ASSERT( + v.strides().size() == MAX_MHA_DIM, + "V tensor has unexpected number of dims, please report a bug to PyTorch."); + std::copy(q.sizes().begin(), q.sizes().end(), params.q_dim.begin()); + std::copy(q.strides().begin(), q.strides().end(), params.q_stride.begin()); + std::copy(k.sizes().begin(), k.sizes().end(), params.k_dim.begin()); + std::copy(k.strides().begin(), k.strides().end(), params.k_stride.begin()); + std::copy(v.sizes().begin(), v.sizes().end(), params.v_dim.begin()); + std::copy(v.strides().begin(), v.strides().end(), params.v_stride.begin()); +} + +struct MHACacheKeyWrapper : ParamsWrapper { + MHACacheKeyWrapper( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + const Tensor& q, + const Tensor& k, + const Tensor& v, + double dropout_probability, + bool is_causal, + bool return_softmaxstats) { + setMHAParams( + this->pod, + b, + h, + s_q, + s_kv, + d, + q, + k, + v, + dropout_probability, + is_causal, + return_softmaxstats); + } +}; + +template +struct MHAGraphCache { + std::unordered_map> engine_cache; + + // no mutexes here as caches are now thread local for v8, can also return a + // pointer to the Execution Plan if we know it will not be invalidated by + // another thread + T* find(const KeyType& key) { + auto it = engine_cache.find(key); + if (it == engine_cache.end()) { + return nullptr; + } + return &(it->second); + } + + void update(const KeyType& key, T& results) { + engine_cache.erase(key); + engine_cache.emplace(key, std::move(results)); + } +}; + +// @eqy: use thread local caches as cuDNN Execution Plans are not guaranteed to +// be thread safe across all engines see Limitations in +// https://docs.nvidia.com/deeplearning/cudnn/release-notes/index.html +thread_local MHAGraphCache mhagraphcache; +thread_local MHAGraphCache + mhagraphbackwardcache; + +auto build_graph_and_tensors( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + float scaling_factor, + 
bool return_softmaxstats, + bool is_causal, + double dropout_probability, + const Tensor& q, + const Tensor& k, + const Tensor& v, + Tensor& softmaxstats, + Tensor& o, + Tensor& dropoutseed, + Tensor& dropoutoffset, + cudnnHandle_t& handle, + MHAParams& params) { + auto dtype = fe::DataType_t::HALF; + if (q.scalar_type() == kBFloat16) { + dtype = fe::DataType_t::BFLOAT16; + } + auto mha_graph = std::make_shared(); + // We're baking in float accumulation and scale types + // in theory the graph may support other types, but they + // have not been tested + mha_graph->set_io_data_type(dtype) + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); + auto Q = mha_graph->tensor( + fe::graph::Tensor_attributes() + .set_name("Q") + .set_dim( + std::vector(params.q_dim.begin(), params.q_dim.end())) + .set_stride(std::vector( + params.q_stride.begin(), params.q_stride.end()))); + auto K = mha_graph->tensor( + fe::graph::Tensor_attributes() + .set_name("K") + .set_dim( + std::vector(params.k_dim.begin(), params.k_dim.end())) + .set_stride(std::vector( + params.k_stride.begin(), params.k_stride.end()))); + auto V = mha_graph->tensor( + fe::graph::Tensor_attributes() + .set_name("V") + .set_dim( + std::vector(params.v_dim.begin(), params.v_dim.end())) + .set_stride(std::vector( + params.v_stride.begin(), params.v_stride.end()))); + auto attn_scale = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Attn_scale") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_is_pass_by_value(true) + .set_data_type(fe::DataType_t::FLOAT)); + // TODO(eqy): support bias in the future in a follow-up PR + // auto bias = mha_graph->tensor(fe::graph::Tensor_attributes() + // .set_name("bias") + // .set_dim({b, 1, s_q, s_kv}) + // .set_stride({s_q * s_kv, s_q * s_kv, s_kv, 1})); + auto seed = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Seed") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto offset = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Offset") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto scaled_dot_product_flash_attention_options = + fe::graph::SDPA_attributes() + .set_name("CUDNN_SDPA") + .set_is_inference(return_softmaxstats == false) + .set_causal_mask(is_causal) + .set_attn_scale(attn_scale) + .set_dropout(dropout_probability, seed, offset); + // Optional bias in flash attention is only supported 8.9.3 onwards + if (cudnnGetVersion() >= 8904) { + // scaled_dot_product_flash_attention_options.set_alibi_mask(true); + } + + auto seq_q = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Seq_q") + .set_dim({b, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto seq_kv = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Seq_kv") + .set_dim({b, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + + // if (cudnnGetVersion() >= 8903) { + // scaled_dot_product_flash_attention_options.set_bias(bias) + // .set_padding_mask(true) + // .set_seq_len_q(seq_q) + // .set_seq_len_kv(seq_kv); + // } + + auto [O, Stats] = + mha_graph->sdpa(Q, K, V, scaled_dot_product_flash_attention_options); + O->set_output(true) + .set_dim(std::vector( + o.sizes().data(), o.sizes().data() + o.sizes().size())) + .set_stride(std::vector( + o.strides().data(), o.strides().data() + o.strides().size())); + + if (Stats) { + 
Stats->set_output(true).set_data_type(fe::DataType_t::FLOAT); + } + + AT_CUDNN_FRONTEND_CHECK(mha_graph->validate()); + AT_CUDNN_FRONTEND_CHECK(mha_graph->build_operation_graph(handle)); + AT_CUDNN_FRONTEND_CHECK( + mha_graph->create_execution_plans({fe::HeurMode_t::A})); + AT_CUDNN_FRONTEND_CHECK(mha_graph->check_support(handle)); + AT_CUDNN_FRONTEND_CHECK(mha_graph->build_plans(handle)); + + return std::make_tuple( + std::move(mha_graph), + std::move(Q), + std::move(K), + std::move(V), + std::move(attn_scale), + std::move(seed), + std::move(offset), + std::move(O), + std::move(Stats)); +} + +auto build_graph_and_tensors_backward( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + float scaling_factor, + bool is_causal, + float dropout_probability, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const Tensor& o, + const Tensor& dO, + const Tensor& softmaxstats, + Tensor& dQ, + Tensor& dK, + Tensor& dV, + const Tensor& dropoutseed, + const Tensor& dropoutoffset, + cudnnHandle_t& handle, + MHAParams& params) { + auto dtype = fe::DataType_t::HALF; + if (q.scalar_type() == kBFloat16) { + dtype = fe::DataType_t::BFLOAT16; + } + auto mha_graph = std::make_shared(); + // We're baking in float accumulation and scale types + // in theory the graph may support other types, but they + // have not been tested + mha_graph->set_io_data_type(dtype) + .set_intermediate_data_type(fe::DataType_t::FLOAT) + .set_compute_data_type(fe::DataType_t::FLOAT); + auto Q = mha_graph->tensor( + fe::graph::Tensor_attributes() + .set_name("Q") + .set_dim(std::vector(q.sizes().begin(), q.sizes().end())) + .set_stride( + std::vector(q.strides().begin(), q.strides().end()))); + auto K = mha_graph->tensor( + fe::graph::Tensor_attributes() + .set_name("K") + .set_dim(std::vector(k.sizes().begin(), k.sizes().end())) + .set_stride( + std::vector(k.strides().begin(), k.strides().end()))); + auto V = mha_graph->tensor( + fe::graph::Tensor_attributes() + .set_name("V") + .set_dim(std::vector(v.sizes().begin(), v.sizes().end())) + .set_stride( + std::vector(v.strides().begin(), v.strides().end()))); + auto attn_scale = + mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Attn_scale") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_is_pass_by_value(true) + .set_data_type(fe::DataType_t::FLOAT)); + auto Seed = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Seed") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto Offset = mha_graph->tensor(fe::graph::Tensor_attributes() + .set_name("Offset") + .set_dim({1, 1, 1, 1}) + .set_stride({1, 1, 1, 1}) + .set_data_type(fe::DataType_t::INT32)); + auto O = mha_graph->tensor( + fe::graph::Tensor_attributes() + .set_name("O") + .set_dim(std::vector(o.sizes().begin(), o.sizes().end())) + .set_stride( + std::vector(o.strides().begin(), o.strides().end()))); + auto STATS = mha_graph->tensor( + fe::graph::Tensor_attributes() + .set_name("Stats") + .set_dim(std::vector( + softmaxstats.sizes().begin(), softmaxstats.sizes().end())) + .set_stride(std::vector( + softmaxstats.strides().begin(), softmaxstats.strides().end())) + .set_data_type(fe::DataType_t::FLOAT)); + auto DO = mha_graph->tensor( + fe::graph::Tensor_attributes() + .set_name("DO") + .set_dim(std::vector(dO.sizes().begin(), dO.sizes().end())) + .set_stride( + std::vector(dO.strides().begin(), dO.strides().end()))); + auto sdpa_backward_options = fe::graph::SDPA_backward_attributes() + .set_name("CUDNN_SDPA_BACKWARD") + 
.set_causal_mask(is_causal) + .set_attn_scale(attn_scale); + if (dropout_probability != 0.0f) { + sdpa_backward_options.set_dropout(dropout_probability, Seed, Offset); + } + auto [DQ, DK, DV] = + mha_graph->sdpa_backward(Q, K, V, O, DO, STATS, sdpa_backward_options); + DQ->set_output(true) + .set_dim(std::vector(dQ.sizes().begin(), dQ.sizes().end())) + .set_stride( + std::vector(dQ.strides().begin(), dQ.strides().end())); + DK->set_output(true) + .set_dim(std::vector(dK.sizes().begin(), dK.sizes().end())) + .set_stride( + std::vector(dK.strides().begin(), dK.strides().end())); + DV->set_output(true) + .set_dim(std::vector(dV.sizes().begin(), dV.sizes().end())) + .set_stride( + std::vector(dV.strides().begin(), dV.strides().end())); + AT_CUDNN_FRONTEND_CHECK(mha_graph->validate()); + AT_CUDNN_FRONTEND_CHECK(mha_graph->build_operation_graph(handle)); + AT_CUDNN_FRONTEND_CHECK( + mha_graph->create_execution_plans({fe::HeurMode_t::A})); + AT_CUDNN_FRONTEND_CHECK(mha_graph->check_support(handle)); + AT_CUDNN_FRONTEND_CHECK(mha_graph->build_plans(handle)); + return std::make_tuple( + std::move(mha_graph), + std::move(Q), + std::move(K), + std::move(V), + std::move(attn_scale), + std::move(Seed), + std::move(Offset), + std::move(O), + std::move(DO), + std::move(STATS), + std::move(DQ), + std::move(DK), + std::move(DV)); +} + +void run_cudnn_SDP_fprop( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + float scaling_factor, + bool return_softmaxstats, + bool is_causal, + double dropout_probability, + const Tensor& q, + const Tensor& k, + const Tensor& v, + Tensor& softmaxstats, + Tensor& o, + Tensor& dropoutseed, + Tensor& dropoutoffset) { + cudnnHandle_t handle = getCudnnHandle(); + o = at::empty_strided( + {b, h, s_q, d}, {s_q * h * d, d, h * d, 1}, q.options()); + if (return_softmaxstats) { + // TODO(eqy): verify that this is correct + softmaxstats = at::empty({b, h, s_q}, q.options().dtype(kFloat)); + } + + auto key = MHACacheKeyWrapper( + b, + h, + s_q, + s_kv, + d, + q, + k, + v, + dropout_probability, + is_causal, + return_softmaxstats); + auto graph_and_tensors_ptr = mhagraphcache.find(key); + graph_and_tensors graph_and_tensors_values; + if (graph_and_tensors_ptr) { + graph_and_tensors_values = *graph_and_tensors_ptr; + } else { + graph_and_tensors_values = build_graph_and_tensors( + b, + h, + s_q, + s_kv, + d, + scaling_factor, + return_softmaxstats, + is_causal, + dropout_probability, + q, + k, + v, + softmaxstats, + o, + dropoutseed, + dropoutoffset, + handle, + key.pod); + } + auto [mha_graph, Q, K, V, attn_scale, seed, offset, O, Stats] = + graph_and_tensors_values; + std::unordered_map, void*> + variant_pack = { + {Q, q.data_ptr()}, + {K, k.data_ptr()}, + {V, v.data_ptr()}, + {attn_scale, &scaling_factor}, + //{bias, bias.data_ptr()}, + {seed, dropoutseed.data_ptr()}, + {offset, dropoutoffset.data_ptr()}, + {O, o.data_ptr()}}; + if (return_softmaxstats) { + variant_pack[Stats] = softmaxstats.data_ptr(); + } + auto workspace_size = mha_graph->get_workspace_size(); + auto workspace_ptr = + c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); + TORCH_CHECK( + mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good()); + mhagraphcache.update(key, graph_and_tensors_values); +} + +void run_cudnn_SDP_bprop( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + float scaling_factor, + bool is_causal, + float dropout_probability, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const Tensor& o, + const Tensor& dO, + 
const Tensor& softmaxstats, + Tensor& dQ, + Tensor& dK, + Tensor& dV, + const Tensor& dropoutseed, + const Tensor& dropoutoffset) { + cudnnHandle_t handle = getCudnnHandle(); + auto key = MHACacheKeyWrapper( + b, h, s_q, s_kv, d, q, k, v, dropout_probability, is_causal, true); + auto graph_and_tensors_backward_ptr = mhagraphbackwardcache.find(key); + graph_and_tensors_backward graph_and_tensors_backward_values; + if (graph_and_tensors_backward_ptr) { + graph_and_tensors_backward_values = *graph_and_tensors_backward_ptr; + } else { + graph_and_tensors_backward_values = build_graph_and_tensors_backward( + b, + h, + s_q, + s_kv, + d, + scaling_factor, + is_causal, + dropout_probability, + q, + k, + v, + o, + dO, + softmaxstats, + dQ, + dK, + dV, + dropoutseed, + dropoutoffset, + handle, + key.pod); + } + auto + [mha_graph, Q, K, V, attn_scale, Seed, Offset, O, Do, Stats, Dq, Dk, Dv] = + graph_and_tensors_backward_values; + std::unordered_map, void*> + variant_pack = {// inputs + {Q, q.data_ptr()}, + {K, k.data_ptr()}, + {V, v.data_ptr()}, + {O, o.data_ptr()}, + {Do, dO.data_ptr()}, + {Stats, softmaxstats.data_ptr()}, + // outputs + {Dq, dQ.data_ptr()}, + {Dk, dK.data_ptr()}, + {Dv, dV.data_ptr()}, + // pass by value + {attn_scale, &scaling_factor}}; + if (dropout_probability != 0.0f) { + variant_pack[Seed] = dropoutseed.data_ptr(); + variant_pack[Offset] = dropoutoffset.data_ptr(); + } + auto workspace_size = mha_graph->get_workspace_size(); + auto workspace_ptr = + c10::cuda::CUDACachingAllocator::get()->allocate(workspace_size); + TORCH_CHECK(!workspace_size || workspace_ptr.get()); + TORCH_CHECK( + mha_graph->execute(handle, variant_pack, workspace_ptr.get()).is_good()); + mhagraphbackwardcache.update(key, graph_and_tensors_backward_values); +} + +} // namespace native +} // namespace at + +#endif diff --git a/aten/src/ATen/native/cudnn/MHA.h b/aten/src/ATen/native/cudnn/MHA.h new file mode 100644 index 0000000000000..0406cf783dc53 --- /dev/null +++ b/aten/src/ATen/native/cudnn/MHA.h @@ -0,0 +1,47 @@ +#pragma once +#include + +namespace at { +namespace native { + +void run_cudnn_SDP_fprop( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + float scaling_factor, + bool isTraining, + bool is_causal, + double dropout_probability, + const Tensor& q, + const Tensor& k, + const Tensor& v, + Tensor& softmaxstats, + Tensor& o, + Tensor& dropoutseed, + Tensor& dropoutoffset); + +void run_cudnn_SDP_bprop( + int64_t b, + int64_t h, + int64_t s_q, + int64_t s_kv, + int64_t d, + float scaling_factor, + bool is_causal, + float dropout_probability, + const Tensor& q, + const Tensor& k, + const Tensor& v, + const Tensor& o, + const Tensor& dO, + const Tensor& softmaxstats, + Tensor& dQ, + Tensor& dK, + Tensor& dV, + const Tensor& dropoutseed, + const Tensor& dropoutoffset); + +} // namespace native +} // namespace at diff --git a/aten/src/ATen/native/cudnn/RNN.cpp b/aten/src/ATen/native/cudnn/RNN.cpp index 7b758309d4cbd..05b1df3114f85 100644 --- a/aten/src/ATen/native/cudnn/RNN.cpp +++ b/aten/src/ATen/native/cudnn/RNN.cpp @@ -1,16 +1,16 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS -#include #include +#include +#include +#include #include #include -#include #include -#include #include -#include -#include #include +#include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -29,1038 +29,1222 @@ #if !AT_CUDNN_ENABLED() -namespace at { namespace native { +namespace at { +namespace native { // See Note [ATen preprocessor philosophy] Tensor _cudnn_rnn_flatten_weight( - 
TensorList weight_arr, int64_t weight_stride0, + TensorList weight_arr, + int64_t weight_stride0, int64_t input_size, - int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, - int64_t fn_num_layers, bool batch_first, - bool fn_bidirectional - ) { + int64_t fn_mode, + int64_t fn_hidden_size, + int64_t fn_proj_size, + int64_t fn_num_layers, + bool batch_first, + bool fn_bidirectional) { AT_ERROR("_cudnn_rnn_flatten_weight: ATen not compiled with cuDNN support"); } std::tuple _cudnn_rnn( const Tensor& input_r, - TensorList weight, int64_t weight_stride0, const c10::optional& weight_buf_r_opt, const Tensor& hx, const c10::optional& cx_opt, - int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, - int64_t fn_num_layers, bool batch_first, double fn_dropout, - bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes, const c10::optional& fn_dropout_state_opt - ) { + TensorList weight, + int64_t weight_stride0, + const c10::optional& weight_buf_r_opt, + const Tensor& hx, + const c10::optional& cx_opt, + int64_t fn_mode, + int64_t fn_hidden_size, + int64_t fn_proj_size, + int64_t fn_num_layers, + bool batch_first, + double fn_dropout, + bool fn_train, + bool fn_bidirectional, + IntArrayRef fn_batch_sizes, + const c10::optional& fn_dropout_state_opt) { AT_ERROR("_cudnn_rnn: ATen not compiled with cuDNN support"); } std::tuple> _cudnn_rnn_backward( - const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const c10::optional& cx_opt, - const Tensor& output, const c10::optional& grad_output_r_opt, const c10::optional& grad_hy_r_opt, const c10::optional& grad_cy_r_opt, - int64_t mode, int64_t hidden_size, int64_t proj_size, - int64_t num_layers, bool batch_first, double dropout, - bool train, bool bidirectional, IntArrayRef batch_sizes, const c10::optional& dropout_state_opt, const Tensor& reserve, - std::array output_mask - ) { + const Tensor& input, + TensorList weight, + int64_t weight_stride0, + const Tensor& weight_buf, + const Tensor& hx, + const c10::optional& cx_opt, + const Tensor& output, + const c10::optional& grad_output_r_opt, + const c10::optional& grad_hy_r_opt, + const c10::optional& grad_cy_r_opt, + int64_t mode, + int64_t hidden_size, + int64_t proj_size, + int64_t num_layers, + bool batch_first, + double dropout, + bool train, + bool bidirectional, + IntArrayRef batch_sizes, + const c10::optional& dropout_state_opt, + const Tensor& reserve, + std::array output_mask) { AT_ERROR("_cudnn_rnn_backward: ATen not compiled with cuDNN support"); } -Tensor _cudnn_init_dropout_state(double dropout, bool train, int64_t dropout_seed, +Tensor _cudnn_init_dropout_state( + double dropout, + bool train, + int64_t dropout_seed, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] - TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); + TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory( + pin_memory); AT_ERROR("_cudnn_init_dropout_state: ATen not compiled with cuDNN support"); } -}} // namespace at::native +} // namespace native +} // namespace at #else // AT_CUDNN_ENABLED() #include -namespace at { namespace native { +namespace at { +namespace native { namespace { - // DropoutDescriptor - - struct DropoutDescriptorParams { - bool train; - double dropout; - Tensor dropout_state; - DropoutDescriptorParams() = default; - void set(bool train_, double dropout_, Tensor 
dropout_state_) { - train = train_; - dropout = dropout_; - dropout_state = dropout_state_; - } - DropoutDescriptor descriptor(cudnnHandle_t handle) const { - auto dropout_p = train ? dropout : 0; - DropoutDescriptor dropout_desc; - if (dropout_p == 0) { - dropout_desc.set_no_dropout(handle); - } else { - dropout_desc.set(handle, dropout_p, dropout_state); - } - return dropout_desc; +// DropoutDescriptor + +struct DropoutDescriptorParams { + bool train; + double dropout; + Tensor dropout_state; + DropoutDescriptorParams() = default; + void set(bool train_, double dropout_, Tensor dropout_state_) { + train = train_; + dropout = dropout_; + dropout_state = dropout_state_; + } + DropoutDescriptor descriptor(cudnnHandle_t handle) const { + auto dropout_p = train ? dropout : 0; + DropoutDescriptor dropout_desc; + if (dropout_p == 0) { + dropout_desc.set_no_dropout(handle); + } else { + dropout_desc.set(handle, dropout_p, dropout_state); } - }; + return dropout_desc; + } +}; - // RNNDescriptor +// RNNDescriptor - struct RNNDescriptorParams { +struct RNNDescriptorParams { #ifdef USE_CUDNN_RNN_V8_API - int64_t input_size; - bool packed; + int64_t input_size; + bool packed; #endif - int64_t hidden_size; - int64_t proj_size; - int64_t num_layers; - cudnnDirectionMode_t bidirectional; - cudnnRNNMode_t mode; - cudnnDataType_t datatype; - cudnnDataType_t input_datatype; - cudnnRNNAlgo_t algo = CUDNN_RNN_ALGO_STANDARD; - cudnnRNNInputMode_t input_mode = CUDNN_LINEAR_INPUT; - - int64_t num_directions() const { - return bidirectional ? 2 : 1; - } + int64_t hidden_size; + int64_t proj_size; + int64_t num_layers; + cudnnDirectionMode_t bidirectional; + cudnnRNNMode_t mode; + cudnnDataType_t datatype; + cudnnDataType_t input_datatype; + cudnnRNNAlgo_t algo = CUDNN_RNN_ALGO_STANDARD; + cudnnRNNInputMode_t input_mode = CUDNN_LINEAR_INPUT; + + int64_t num_directions() const { + return bidirectional ? 2 : 1; + } - void set_mode(int64_t fn_mode) { - switch (fn_mode) { - case CUDNN_RNN_RELU: - mode = CUDNN_RNN_RELU; - break; - case CUDNN_RNN_TANH: - mode = CUDNN_RNN_TANH; - break; - case CUDNN_LSTM: - mode = CUDNN_LSTM; - break; - case CUDNN_GRU: - mode = CUDNN_GRU; - break; - default: - { - std::ostringstream oss; - oss << "unrecognized cuDNN RNN mode " << fn_mode; - AT_ERROR(oss.str()); - } + void set_mode(int64_t fn_mode) { + switch (fn_mode) { + case CUDNN_RNN_RELU: + mode = CUDNN_RNN_RELU; + break; + case CUDNN_RNN_TANH: + mode = CUDNN_RNN_TANH; + break; + case CUDNN_LSTM: + mode = CUDNN_LSTM; + break; + case CUDNN_GRU: + mode = CUDNN_GRU; + break; + default: { + std::ostringstream oss; + oss << "unrecognized cuDNN RNN mode " << fn_mode; + AT_ERROR(oss.str()); } } + } - void set_bidirectional(bool fn_bidirectional) { - bidirectional = fn_bidirectional ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; - } + void set_bidirectional(bool fn_bidirectional) { + bidirectional = + fn_bidirectional ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL; + } - void set_algo(cudnnRNNAlgo_t algo){ - this->algo = algo; - } + void set_algo(cudnnRNNAlgo_t algo) { + this->algo = algo; + } #ifndef USE_CUDNN_RNN_V8_API - void set(int64_t mode, int64_t hidden_size, int64_t proj_size, int64_t num_layers, bool bidirectional, cudnnDataType_t datatype, cudnnDataType_t input_datatype) { + void set( + int64_t mode, + int64_t hidden_size, + int64_t proj_size, + int64_t num_layers, + bool bidirectional, + cudnnDataType_t datatype, + cudnnDataType_t input_datatype){ #else - void set(int64_t mode, int64_t input_size, bool packed, int64_t hidden_size, int64_t proj_size, int64_t num_layers, bool bidirectional, cudnnDataType_t datatype, cudnnDataType_t input_datatype) { + void set( + int64_t mode, + int64_t input_size, + bool packed, + int64_t hidden_size, + int64_t proj_size, + int64_t num_layers, + bool bidirectional, + cudnnDataType_t datatype, + cudnnDataType_t input_datatype) { #endif this->set_mode(mode); #ifdef USE_CUDNN_RNN_V8_API - this->input_size = input_size; - this->packed = packed; + this->input_size = input_size; + this->packed = packed; #endif - this->hidden_size = hidden_size; - this->proj_size = proj_size; - this->num_layers = num_layers; - this->set_bidirectional(bidirectional); - this->datatype = datatype; - this->input_datatype = input_datatype; - } + this->hidden_size = hidden_size; + this->proj_size = proj_size; + this->num_layers = num_layers; + this->set_bidirectional(bidirectional); + this->datatype = datatype; + this->input_datatype = input_datatype; +} - RNNDescriptor descriptor(cudnnHandle_t handle, DropoutDescriptor&& dropout_desc) const { - RNNDescriptor rnn_desc; +RNNDescriptor +descriptor(cudnnHandle_t handle, DropoutDescriptor&& dropout_desc) const { + RNNDescriptor rnn_desc; #ifndef USE_CUDNN_RNN_V8_API - rnn_desc.set(handle, hidden_size, proj_size, num_layers, std::move(dropout_desc), input_mode, bidirectional, mode, datatype, input_datatype, algo, at::globalContext().allowTF32CuDNN()); + rnn_desc.set( + handle, + hidden_size, + proj_size, + num_layers, + std::move(dropout_desc), + input_mode, + bidirectional, + mode, + datatype, + input_datatype, + algo, + at::globalContext().allowTF32CuDNN()); #else - rnn_desc.set(handle, input_size, packed, hidden_size, proj_size, num_layers, std::move(dropout_desc), input_mode, bidirectional, mode, datatype, input_datatype, algo, at::globalContext().allowTF32CuDNN()); + rnn_desc.set( + handle, + input_size, + packed, + hidden_size, + proj_size, + num_layers, + std::move(dropout_desc), + input_mode, + bidirectional, + mode, + datatype, + input_datatype, + algo, + at::globalContext().allowTF32CuDNN()); #endif - return rnn_desc; - } + return rnn_desc; +} - // In some cases, a use of RNNDescriptor does not rely on the - // DropoutDescriptor. In this case, we fake up a no-dropout - // descriptor to make the RNN descriptor initialization go through. - // This is used by _cudnn_rnn_flatten_weight, which needs an - // RNNDescriptor for get_parameters(), but does not actually need - // a fully initialized dropout descriptor. This lets us avoid - // having to pass the dropout state to flatten, which has no business - // knowing what the dropout state is. - RNNDescriptor descriptor(cudnnHandle_t handle) const { - DropoutDescriptor dropout_desc; - dropout_desc.set_no_dropout(handle); - return descriptor(handle, std::move(dropout_desc)); - } - }; +// In some cases, a use of RNNDescriptor does not rely on the +// DropoutDescriptor. 
In this case, we fake up a no-dropout +// descriptor to make the RNN descriptor initialization go through. +// This is used by _cudnn_rnn_flatten_weight, which needs an +// RNNDescriptor for get_parameters(), but does not actually need +// a fully initialized dropout descriptor. This lets us avoid +// having to pass the dropout state to flatten, which has no business +// knowing what the dropout state is. +RNNDescriptor descriptor(cudnnHandle_t handle) const { + DropoutDescriptor dropout_desc; + dropout_desc.set_no_dropout(handle); + return descriptor(handle, std::move(dropout_desc)); +} +}; // namespace - // TensorDescriptor list +// TensorDescriptor list #ifndef USE_CUDNN_RNN_V8_API - std::vector rnn_descriptor_sequence(const Tensor& tensor, IntArrayRef batch_sizes) { - std::vector descriptors(batch_sizes.size()); - size_t i = 0; - // To be mutated in the loop - auto batch_tensor_size = tensor.sizes().vec(); - for (auto batch_size : batch_sizes) { - batch_tensor_size[0] = batch_size; - // NB: cuDNN RNN API does not support 2d descriptors, so we - // must pad it out to 3d. - descriptors[i].set(getCudnnDataType(tensor), batch_tensor_size, tensor.strides(), 3); - i++; - } - return descriptors; +std::vector rnn_descriptor_sequence( + const Tensor& tensor, + IntArrayRef batch_sizes) { + std::vector descriptors(batch_sizes.size()); + size_t i = 0; + // To be mutated in the loop + auto batch_tensor_size = tensor.sizes().vec(); + for (auto batch_size : batch_sizes) { + batch_tensor_size[0] = batch_size; + // NB: cuDNN RNN API does not support 2d descriptors, so we + // must pad it out to 3d. + descriptors[i].set( + getCudnnDataType(tensor), batch_tensor_size, tensor.strides(), 3); + i++; } + return descriptors; +} - std::vector rnn_descriptor(const Tensor& tensor, int64_t N) { - std::vector descriptors(N); - for (const auto i : c10::irange(N)) { - descriptors[i].set(tensor, 5); - } - return descriptors; +std::vector rnn_descriptor(const Tensor& tensor, int64_t N) { + std::vector descriptors(N); + for (const auto i : c10::irange(N)) { + descriptors[i].set(tensor, 5); } + return descriptors; +} #else - auto rnn_descriptor_sequence(const Tensor& tensor, uint32_t batch_size, const IntArrayRef batch_sizes, uint32_t seq_len, uint32_t vector_size) { // packed case - RNNDataDescriptor r; - std::vector seqLengthArray(batch_size, 1); - // cuDNN wants the sequence lenghts for a packed batch as if they - // were unpacked, e.g., for the - // Sequence 1: ABCD - // Sequence 2: EF - // Sequence 3: G - // case below, this would be [4, 2, 1] (has length == mini_batch) - // TODO(eqy): There's probably a smarter way to do this than O(SN) - for (auto it = batch_sizes.begin(); it != batch_sizes.end(); it++) { - // everyone starts at sequence length 1 so we skip an iteration - if (it == batch_sizes.begin()) { - continue; - } - for (const auto idx : c10::irange(*it)) { - seqLengthArray[idx]++; - } +auto rnn_descriptor_sequence( + const Tensor& tensor, + uint32_t batch_size, + const IntArrayRef batch_sizes, + uint32_t seq_len, + uint32_t vector_size) { // packed case + RNNDataDescriptor r; + std::vector seqLengthArray(batch_size, 1); + // cuDNN wants the sequence lengths for a packed batch as if they + // were unpacked, e.g., for the + // Sequence 1: ABCD + // Sequence 2: EF + // Sequence 3: G + // case below, this would be [4, 2, 1] (has length == mini_batch) + // TODO(eqy): There's probably a smarter way to do this than O(SN) + for (auto it = batch_sizes.begin(); it != batch_sizes.end(); it++) { + // everyone starts 
at sequence length 1 so we skip an iteration + if (it == batch_sizes.begin()) { + continue; + } + for (const auto idx : c10::irange(*it)) { + seqLengthArray[idx]++; } - r.set(tensor, CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, seq_len, batch_size, vector_size, seqLengthArray.data()); - return r; } + r.set( + tensor, + CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_PACKED, + seq_len, + batch_size, + vector_size, + seqLengthArray.data()); + return r; +} - auto rnn_descriptor(const Tensor& tensor, uint32_t batch_size, uint32_t seq_len, uint32_t vector_size) { - RNNDataDescriptor r; - // NB: Looks like even if batch_first is true here we always want SEQ_MAJOR_UNPACKED, because the input - // appears to be transposed if it is barch-major - const auto layout = CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED; - std::vector seqLengthArray(batch_size, seq_len); - r.set(tensor, layout, seq_len, batch_size, vector_size, seqLengthArray.data()); - return r; - } +auto rnn_descriptor( + const Tensor& tensor, + uint32_t batch_size, + uint32_t seq_len, + uint32_t vector_size) { + RNNDataDescriptor r; + // NB: Looks like even if batch_first is true here we always want + // SEQ_MAJOR_UNPACKED, because the input appears to be transposed if it is + // barch-major + const auto layout = CUDNN_RNN_DATA_LAYOUT_SEQ_MAJOR_UNPACKED; + std::vector seqLengthArray(batch_size, seq_len); + r.set( + tensor, layout, seq_len, batch_size, vector_size, seqLengthArray.data()); + return r; +} #endif - // The best way to understand the meaning of the values stored in - // this struct is to consider each of the possible ways our - // input can be structured. - // - // Suppose you want to run RNN on the following variable - // length inputs: - // - // Sequence 1: ABCD - // Sequence 2: EF - // Sequence 3: G - // - // (Let _ be padding when we have non-packed representations.) - // - // # Packed input (batch_sizes is non-empty) - // - // input_size - // +------+ + - // | A | | - // | E | mini_batch = | - // | G | batch_sizes[0] = 3 | - // +------+ | - // | B | | batch_sizes_sum = 7 - // | F | batch_sizes[1] = 2 | - // +------+ | - // | C | batch_sizes[2] = 1 | - // +------+ | - // | D | batch_sizes[3] = 1 | - // +------+ + - // - // (seq_length = 4) - // - // input.size() = batch_sizes_sum x input_size - // - // # Unpacked input (batch_first = false) - // - // mini_batch = 3 - // +-------+ - // | A E G | - // | B F _ | seq_length = 4 - // | C _ _ | - // | D _ _ | - // +-------+ - // ... input_size - // +-------+ - // - // input.size() = seq_length x mini_batch x input_size - // - // # Unpacked input (batch_first = true) - // - // seq_length = 4 - // +---------+ - // | A B C D | - // | E F _ _ | mini_batch = 3 - // | G _ _ _ | - // +---------+ - // ... input_size - // +---------+ - // - // input.size() = mini_batch x seq_length x input_size - // - struct TensorDescriptorListParams { - IntArrayRef batch_sizes; - int64_t seq_length; - int64_t mini_batch; - // NB: this is not input.size(), which is an IntArrayRef; instead, this - // size of the inner-most dimension. In NL applications, this is usually - // the size of the embedding. 
You can also think of this as the size - // of the "channel" dimension (at risk of confusing vision researchers :) - int64_t input_size; - // Only valid when !is_input_packed - int64_t batch_sizes_sum; // == sum(batch_sizes) - - bool is_input_packed() const { - return batch_sizes.size() != 0; - } +// The best way to understand the meaning of the values stored in +// this struct is to consider each of the possible ways our +// input can be structured. +// +// Suppose you want to run RNN on the following variable +// length inputs: +// +// Sequence 1: ABCD +// Sequence 2: EF +// Sequence 3: G +// +// (Let _ be padding when we have non-packed representations.) +// +// # Packed input (batch_sizes is non-empty) +// +// input_size +// +------+ + +// | A | | +// | E | mini_batch = | +// | G | batch_sizes[0] = 3 | +// +------+ | +// | B | | batch_sizes_sum = 7 +// | F | batch_sizes[1] = 2 | +// +------+ | +// | C | batch_sizes[2] = 1 | +// +------+ | +// | D | batch_sizes[3] = 1 | +// +------+ + +// +// (seq_length = 4) +// +// input.size() = batch_sizes_sum x input_size +// +// # Unpacked input (batch_first = false) +// +// mini_batch = 3 +// +-------+ +// | A E G | +// | B F _ | seq_length = 4 +// | C _ _ | +// | D _ _ | +// +-------+ +// ... input_size +// +-------+ +// +// input.size() = seq_length x mini_batch x input_size +// +// # Unpacked input (batch_first = true) +// +// seq_length = 4 +// +---------+ +// | A B C D | +// | E F _ _ | mini_batch = 3 +// | G _ _ _ | +// +---------+ +// ... input_size +// +---------+ +// +// input.size() = mini_batch x seq_length x input_size +// +struct TensorDescriptorListParams { + IntArrayRef batch_sizes; + int64_t seq_length; + int64_t mini_batch; + // NB: this is not input.size(), which is an IntArrayRef; instead, this + // size of the inner-most dimension. In NL applications, this is usually + // the size of the embedding. You can also think of this as the size + // of the "channel" dimension (at risk of confusing vision researchers :) + int64_t input_size; + // Only valid when !is_input_packed + int64_t batch_sizes_sum; // == sum(batch_sizes) + + bool is_input_packed() const { + return batch_sizes.size() != 0; + } - void set(IntArrayRef input_sizes, IntArrayRef batch_sizes_, bool batch_first) { - batch_sizes = batch_sizes_; - if (is_input_packed()) { - seq_length = batch_sizes.size(); - mini_batch = batch_sizes[0]; - // NB: When input is packed, the mini_batch size is NOT the size - // of the outer dimension - batch_sizes_sum = input_sizes[0]; - input_size = input_sizes[1]; + void set( + IntArrayRef input_sizes, + IntArrayRef batch_sizes_, + bool batch_first) { + batch_sizes = batch_sizes_; + if (is_input_packed()) { + seq_length = batch_sizes.size(); + mini_batch = batch_sizes[0]; + // NB: When input is packed, the mini_batch size is NOT the size + // of the outer dimension + batch_sizes_sum = input_sizes[0]; + input_size = input_sizes[1]; + } else { + if (batch_first) { + seq_length = input_sizes[1]; + mini_batch = input_sizes[0]; } else { - if (batch_first) { - seq_length = input_sizes[1]; - mini_batch = input_sizes[0]; - } else { - seq_length = input_sizes[0]; - mini_batch = input_sizes[1]; - } - input_size = input_sizes[2]; - // TODO: Actually, would this make ASAN's job harder catching - // an uninitialized access? 
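// A standalone sketch (illustrative only, not part of this diff) of the packed-input
// bookkeeping described in the layout comment above, worked through for the example
// sequences ABCD / EF / G. It also derives the per-sequence lengths the same way
// rnn_descriptor_sequence does for cuDNN's packed RNNDataDescriptor. Only the C++
// standard library is used; the variable names mirror TensorDescriptorListParams
// purely for readability.
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  // batch_sizes[t] = number of sequences still active at time step t.
  std::vector<int64_t> batch_sizes = {3, 2, 1, 1};

  int64_t seq_length = static_cast<int64_t>(batch_sizes.size()); // 4
  int64_t mini_batch = batch_sizes[0];                           // 3
  int64_t batch_sizes_sum = 0;                                   // 7 rows in the packed input
  for (int64_t b : batch_sizes) {
    batch_sizes_sum += b;
  }

  // Unpack the step-wise batch sizes into per-sequence lengths ([4, 2, 1]),
  // i.e. the seqLengthArray that the packed descriptor path expects.
  std::vector<int32_t> seqLengthArray(mini_batch, 1); // every sequence has length >= 1
  for (size_t t = 1; t < batch_sizes.size(); ++t) {
    for (int64_t i = 0; i < batch_sizes[t]; ++i) {
      seqLengthArray[i]++;
    }
  }

  std::printf("seq_length=%lld mini_batch=%lld batch_sizes_sum=%lld\n",
              (long long)seq_length, (long long)mini_batch,
              (long long)batch_sizes_sum);
  for (int32_t len : seqLengthArray) {
    std::printf("sequence length: %d\n", len);
  }
  return 0;
}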
- batch_sizes_sum = -1; // something bogus in case we access it + seq_length = input_sizes[0]; + mini_batch = input_sizes[1]; } + input_size = input_sizes[2]; + // TODO: Actually, would this make ASAN's job harder catching + // an uninitialized access? + batch_sizes_sum = -1; // something bogus in case we access it } + } #ifndef USE_CUDNN_RNN_V8_API - // TODO: check x for consistency with input_size? - std::vector descriptors(Tensor x) const { - auto is_input_packed = batch_sizes.size() != 0; - if (is_input_packed) { - return rnn_descriptor_sequence(x, batch_sizes); - } else { - return rnn_descriptor(x[0], seq_length); - } + // TODO: check x for consistency with input_size? + std::vector descriptors(Tensor x) const { + auto is_input_packed = batch_sizes.size() != 0; + if (is_input_packed) { + return rnn_descriptor_sequence(x, batch_sizes); + } else { + return rnn_descriptor(x[0], seq_length); } + } #else - auto descriptors(Tensor x) const { - auto is_input_packed = batch_sizes.size() != 0; - if (is_input_packed) { - return rnn_descriptor_sequence(x, mini_batch, batch_sizes, seq_length, x.size(-1)); - } else { - return rnn_descriptor(x, mini_batch, seq_length, x.size(-1)); - } + auto descriptors(Tensor x) const { + auto is_input_packed = batch_sizes.size() != 0; + if (is_input_packed) { + return rnn_descriptor_sequence( + x, mini_batch, batch_sizes, seq_length, x.size(-1)); + } else { + return rnn_descriptor(x, mini_batch, seq_length, x.size(-1)); } + } #endif - }; +}; - // Everything together +// Everything together - struct RNNParams { - DropoutDescriptorParams dropout; - RNNDescriptorParams rnn; - TensorDescriptorListParams tensors; - }; +struct RNNParams { + DropoutDescriptorParams dropout; + RNNDescriptorParams rnn; + TensorDescriptorListParams tensors; +}; - // NB: Doesn't include the weight descriptor - struct RNNDescriptors { - RNNDescriptor rnn_desc; - // NB: this won't actually lay out the tensor descriptor pointers - // in the right way, so you'll have to preprocess them +// NB: Doesn't include the weight descriptor +struct RNNDescriptors { + RNNDescriptor rnn_desc; + // NB: this won't actually lay out the tensor descriptor pointers + // in the right way, so you'll have to preprocess them #ifndef USE_CUDNN_RNN_V8_API - std::vector x_descs; - std::vector y_descs; + std::vector x_descs; + std::vector y_descs; #else - RNNDataDescriptor x_descs; - RNNDataDescriptor y_descs; + RNNDataDescriptor x_descs; + RNNDataDescriptor y_descs; #endif - TensorDescriptor hx_desc; - TensorDescriptor hy_desc; - TensorDescriptor cx_desc; - TensorDescriptor cy_desc; - - RNNDescriptors(const RNNParams& fn, cudnnHandle_t handle, Tensor x, Tensor y, Tensor hx, Tensor cx) { - rnn_desc = fn.rnn.descriptor(handle, fn.dropout.descriptor(handle)); - x_descs = fn.tensors.descriptors(x); - y_descs = fn.tensors.descriptors(y); - hx_desc.set(hx, 5); - hy_desc.set(hx, 5); - if (cx.defined()) { - cx_desc.set(cx, 5); - cy_desc.set(cx, 5); - } + TensorDescriptor hx_desc; + TensorDescriptor hy_desc; + TensorDescriptor cx_desc; + TensorDescriptor cy_desc; + + RNNDescriptors( + const RNNParams& fn, + cudnnHandle_t handle, + Tensor x, + Tensor y, + Tensor hx, + Tensor cx) { + rnn_desc = fn.rnn.descriptor(handle, fn.dropout.descriptor(handle)); + x_descs = fn.tensors.descriptors(x); + y_descs = fn.tensors.descriptors(y); + hx_desc.set(hx, 5); + hy_desc.set(hx, 5); + if (cx.defined()) { + cx_desc.set(cx, 5); + cy_desc.set(cx, 5); } + } - // TODO: This is annoying, having to put the cudnnTensorDescriptor_t - // in a 
contiguous array... - std::vector get_descs(const std::vector& descs) { - std::vector r; - r.reserve(descs.size()); - for (auto& desc : descs) { - r.emplace_back(desc.desc()); - } - return r; + // TODO: This is annoying, having to put the cudnnTensorDescriptor_t + // in a contiguous array... + std::vector get_descs( + const std::vector& descs) { + std::vector r; + r.reserve(descs.size()); + for (auto& desc : descs) { + r.emplace_back(desc.desc()); } + return r; + } #ifndef USE_CUDNN_RNN_V8_API - std::vector get_x_descs() { - return get_descs(x_descs); - } + std::vector get_x_descs() { + return get_descs(x_descs); + } - std::vector get_y_descs() { - return get_descs(y_descs); - } + std::vector get_y_descs() { + return get_descs(y_descs); + } #endif - }; +}; - int64_t get_num_weights(cudnnHandle_t handle, const RNNDescriptor& rnn_desc, +int64_t get_num_weights( + cudnnHandle_t handle, + const RNNDescriptor& rnn_desc, #ifndef USE_CUDNN_RNN_V8_API - const TensorDescriptor& x_desc, + const TensorDescriptor& x_desc, #endif - cudnnDataType_t datatype) { - size_t weight_size; + cudnnDataType_t datatype) { + size_t weight_size; #ifndef USE_CUDNN_RNN_V8_API - AT_CUDNN_CHECK(cudnnGetRNNParamsSize(handle, rnn_desc.desc(), x_desc.desc(), &weight_size, datatype)); + AT_CUDNN_CHECK(cudnnGetRNNParamsSize( + handle, rnn_desc.desc(), x_desc.desc(), &weight_size, datatype)); #else - AT_CUDNN_CHECK(cudnnGetRNNWeightSpaceSize(handle, rnn_desc.desc(), &weight_size)); + AT_CUDNN_CHECK( + cudnnGetRNNWeightSpaceSize(handle, rnn_desc.desc(), &weight_size)); #endif - auto elem_size = dataSize(datatype); - TORCH_INTERNAL_ASSERT(weight_size % elem_size == 0, "cudnnGetRNNParamsSize returned nonsensical weight_size"); - return weight_size / elem_size; - } + auto elem_size = dataSize(datatype); + TORCH_INTERNAL_ASSERT( + weight_size % elem_size == 0, + "cudnnGetRNNParamsSize returned nonsensical weight_size"); + return weight_size / elem_size; +} - int64_t _num_linear_layers(cudnnRNNMode_t mode) { - switch(mode) { - case CUDNN_LSTM: - return 8; - case CUDNN_GRU: - return 6; - case CUDNN_RNN_RELU: - return 2; - case CUDNN_RNN_TANH: - return 2; - default: - AT_ERROR("unknown cuDNN RNN mode ", mode); - } +int64_t _num_linear_layers(cudnnRNNMode_t mode) { + switch (mode) { + case CUDNN_LSTM: + return 8; + case CUDNN_GRU: + return 6; + case CUDNN_RNN_RELU: + return 2; + case CUDNN_RNN_TANH: + return 2; + default: + AT_ERROR("unknown cuDNN RNN mode ", mode); } +} - void add_projection_weights( - cudnnHandle_t handle, - const RNNDescriptor& rnn_desc, +void add_projection_weights( + cudnnHandle_t handle, + const RNNDescriptor& rnn_desc, #ifndef USE_CUDNN_RNN_V8_API - const TensorDescriptor& x_desc, - const FilterDescriptor& w_desc, + const TensorDescriptor& x_desc, + const FilterDescriptor& w_desc, #endif - const Tensor& weight_buf, - int64_t layer, - std::vector& params - ) { - void* matrix_pointer = nullptr; - // assuming it's LSTM which has 8 "linear layers" (i.e. 4 weights and 4 biases) - int64_t linear_id = 8; + const Tensor& weight_buf, + int64_t layer, + std::vector& params) { + void* matrix_pointer = nullptr; + // assuming it's LSTM which has 8 "linear layers" (i.e. 
4 weights and 4 + // biases) + int64_t linear_id = 8; #ifndef USE_CUDNN_RNN_V8_API - FilterDescriptor lin_layer_mat_desc; - AT_CUDNN_CHECK(cudnnGetRNNLinLayerMatrixParams( - /*handle=*/handle, - /*rnnDesc=*/rnn_desc.desc(), - /*layer=*/layer, - /*xDesc=*/x_desc.desc(), - /*wDesc=*/w_desc.desc(), - /*w=*/weight_buf.data_ptr(), - /*linLayerID=*/linear_id, - /*linLayerMatDesc=*/lin_layer_mat_desc.mut_desc(), - /*linLayerMat=*/&matrix_pointer)); + FilterDescriptor lin_layer_mat_desc; + AT_CUDNN_CHECK(cudnnGetRNNLinLayerMatrixParams( + /*handle=*/handle, + /*rnnDesc=*/rnn_desc.desc(), + /*layer=*/layer, + /*xDesc=*/x_desc.desc(), + /*wDesc=*/w_desc.desc(), + /*w=*/weight_buf.data_ptr(), + /*linLayerID=*/linear_id, + /*linLayerMatDesc=*/lin_layer_mat_desc.mut_desc(), + /*linLayerMat=*/&matrix_pointer)); #else - void *unused_pointer; - TensorDescriptor unused_desc; - TensorDescriptor lin_layer_mat_desc; - AT_CUDNN_CHECK(cudnnGetRNNWeightParams( - /*handle=*/handle, - /*rnnDesc=*/rnn_desc.desc(), - /*layer=*/layer, - /*wDesc=*/weight_buf.numel() * weight_buf.element_size(), - /*w=*/weight_buf.data_ptr(), - /*linLayerID=*/linear_id, - /*linLayerMatDesc=*/lin_layer_mat_desc.mut_desc(), - /*linLayerMat=*/&matrix_pointer, unused_desc.mut_desc(), &unused_pointer)); + void* unused_pointer; + TensorDescriptor unused_desc; + TensorDescriptor lin_layer_mat_desc; + AT_CUDNN_CHECK(cudnnGetRNNWeightParams( + /*handle=*/handle, + /*rnnDesc=*/rnn_desc.desc(), + /*layer=*/layer, + /*wDesc=*/weight_buf.numel() * weight_buf.element_size(), + /*w=*/weight_buf.data_ptr(), + /*linLayerID=*/linear_id, + /*linLayerMatDesc=*/lin_layer_mat_desc.mut_desc(), + /*linLayerMat=*/&matrix_pointer, + unused_desc.mut_desc(), + &unused_pointer)); #endif - cudnnDataType_t data_type; + cudnnDataType_t data_type; #ifndef USE_CUDNN_RNN_V8_API - cudnnTensorFormat_t format; + cudnnTensorFormat_t format; #else - int stride_dim_a[5]; + int stride_dim_a[5]; #endif - int nb_dims; - constexpr int min_dim = 3; - int filter_dim_a[min_dim]; + int nb_dims; + constexpr int min_dim = 3; + int filter_dim_a[min_dim]; #ifndef USE_CUDNN_RNN_V8_API - AT_CUDNN_CHECK( - cudnnGetFilterNdDescriptor( - lin_layer_mat_desc.desc(), - min_dim, - &data_type, - &format, - &nb_dims, - filter_dim_a - )); + AT_CUDNN_CHECK(cudnnGetFilterNdDescriptor( + lin_layer_mat_desc.desc(), + min_dim, + &data_type, + &format, + &nb_dims, + filter_dim_a)); #else - AT_CUDNN_CHECK( - cudnnGetTensorNdDescriptor( - lin_layer_mat_desc.desc(), - min_dim, - &data_type, - &nb_dims, - filter_dim_a, - stride_dim_a - )); + AT_CUDNN_CHECK(cudnnGetTensorNdDescriptor( + lin_layer_mat_desc.desc(), + min_dim, + &data_type, + &nb_dims, + filter_dim_a, + stride_dim_a)); #endif - TORCH_INTERNAL_ASSERT(nb_dims <= min_dim, "nb_dims = ", nb_dims, "; min_dim = ", min_dim); - auto elem_size = dataSize(getCudnnDataType(weight_buf)); - auto offset_bytes = (char*)matrix_pointer - (char*)weight_buf.data_ptr(); - TORCH_INTERNAL_ASSERT(offset_bytes % elem_size == 0, "offset_bytes = ", offset_bytes, "; elem_size = ", elem_size); - size_t offset = offset_bytes / elem_size; - - int mat_numel = c10::multiply_integers(filter_dim_a, filter_dim_a + nb_dims); - // Generate a new parameter tensor which is a view into the weight_buf. 
- std::initializer_list size = {mat_numel, 1}; - Tensor param = at::empty({0}, weight_buf.options()).set_(weight_buf.storage(), offset, size); - params.emplace_back(std::move(param)); - } - + TORCH_INTERNAL_ASSERT( + nb_dims <= min_dim, "nb_dims = ", nb_dims, "; min_dim = ", min_dim); + auto elem_size = dataSize(getCudnnDataType(weight_buf)); + auto offset_bytes = (char*)matrix_pointer - (char*)weight_buf.data_ptr(); + TORCH_INTERNAL_ASSERT( + offset_bytes % elem_size == 0, + "offset_bytes = ", + offset_bytes, + "; elem_size = ", + elem_size); + size_t offset = offset_bytes / elem_size; + + int mat_numel = c10::multiply_integers(filter_dim_a, filter_dim_a + nb_dims); + // Generate a new parameter tensor which is a view into the weight_buf. + std::initializer_list size = {mat_numel, 1}; + Tensor param = at::empty({0}, weight_buf.options()) + .set_(weight_buf.storage(), offset, size); + params.emplace_back(std::move(param)); +} - /* - Returns weight and bias tensors for each layer of the RNN. These tensors - are views on the underlying weight buffer allocated by CuDNN. - - Note: for LSTM and GRU, which have multiple parameters of each type (4 and 3, respectively), - these parameters are concatenated along the first dimension. - These parameters are returned in a consistent order by CuDNN: - (reset, forget, cell, output) for LSTM - (reset, input, new) for GRU - Args: - fn: The RNN function object holding the RNN state - handle: a CuDNN handle - weight_buf: a 1D tensor containing the CuDNN-allocated weight (or grad_weight) buffer - Returns: - parameters: [(weight_ih, weight_hh, bias_ih, bias_hh)*], with length equal to the num_layers. - This is represented as a pair of vector, and outer-dimension stride - (NB: Can't return MatrixRef because we need to allocate the underlying tensor) - */ - std::pair, size_t> // stride0 - get_parameters( - cudnnHandle_t handle, - const RNNDescriptorParams& rnn, - const RNNDescriptor& rnn_desc, +/* + Returns weight and bias tensors for each layer of the RNN. These tensors + are views on the underlying weight buffer allocated by CuDNN. + + Note: for LSTM and GRU, which have multiple parameters of each type (4 and 3, + respectively), these parameters are concatenated along the first dimension. + These parameters are returned in a consistent order by CuDNN: + (reset, forget, cell, output) for LSTM + (reset, input, new) for GRU + Args: + fn: The RNN function object holding the RNN state + handle: a CuDNN handle + weight_buf: a 1D tensor containing the CuDNN-allocated weight (or + grad_weight) buffer Returns: parameters: [(weight_ih, weight_hh, bias_ih, + bias_hh)*], with length equal to the num_layers. 
This is represented as a pair + of vector, and outer-dimension stride (NB: Can't return MatrixRef because we + need to allocate the underlying tensor) +*/ +std::pair, size_t> // stride0 +get_parameters( + cudnnHandle_t handle, + const RNNDescriptorParams& rnn, + const RNNDescriptor& rnn_desc, #ifndef USE_CUDNN_RNN_V8_API - const TensorDescriptor& x_desc, - const FilterDescriptor& w_desc, + const TensorDescriptor& x_desc, + const FilterDescriptor& w_desc, #endif - const Tensor& weight_buf, - bool include_bias=true - ) { + const Tensor& weight_buf, + bool include_bias = true) { #ifndef USE_CUDNN_RNN_V8_API - auto cudnn_methods = { cudnnGetRNNLinLayerMatrixParams, cudnnGetRNNLinLayerBiasParams }; + auto cudnn_methods = { + cudnnGetRNNLinLayerMatrixParams, cudnnGetRNNLinLayerBiasParams}; #else - auto cudnn_methods = { true, false }; + auto cudnn_methods = {true, false}; #endif - std::vector params; - int64_t num_linear_layers = _num_linear_layers(rnn.mode); - int64_t num_layers = rnn.num_directions() * rnn.num_layers; - size_t cur_offset = 0; - size_t global_layer_params_count = 0; - for (const auto layer : c10::irange(num_layers)) { - size_t layer_params_count = 0; - for (auto cudnn_method : cudnn_methods) { - for (const auto linear_id : c10::irange(num_linear_layers)) { - void* matrix_pointer; + std::vector params; + int64_t num_linear_layers = _num_linear_layers(rnn.mode); + int64_t num_layers = rnn.num_directions() * rnn.num_layers; + size_t cur_offset = 0; + size_t global_layer_params_count = 0; + for (const auto layer : c10::irange(num_layers)) { + size_t layer_params_count = 0; + for (auto cudnn_method : cudnn_methods) { + for (const auto linear_id : c10::irange(num_linear_layers)) { + void* matrix_pointer; #ifndef USE_CUDNN_RNN_V8_API - FilterDescriptor lin_layer_mat_desc; - AT_CUDNN_CHECK(cudnn_method( + FilterDescriptor lin_layer_mat_desc; + AT_CUDNN_CHECK(cudnn_method( + handle, + rnn_desc.desc(), + layer, + x_desc.desc(), + w_desc.desc(), + weight_buf.data_ptr(), + linear_id, + lin_layer_mat_desc.mut_desc(), + &matrix_pointer)); +#else + void* unused_pointer = nullptr; + TensorDescriptor unused_desc; + TensorDescriptor lin_layer_mat_desc; + for (int stateless = 0; stateless < 100; stateless++) { + if (cudnn_method) { // matrix + AT_CUDNN_CHECK(cudnnGetRNNWeightParams( handle, rnn_desc.desc(), layer, - x_desc.desc(), - w_desc.desc(), + weight_buf.numel() * weight_buf.element_size(), weight_buf.data_ptr(), linear_id, lin_layer_mat_desc.mut_desc(), - &matrix_pointer - )); -#else - void *unused_pointer = nullptr; - TensorDescriptor unused_desc; - TensorDescriptor lin_layer_mat_desc; - for (int stateless = 0; stateless < 100; stateless++) { - if (cudnn_method) { // matrix - AT_CUDNN_CHECK(cudnnGetRNNWeightParams( - handle, - rnn_desc.desc(), - layer, - weight_buf.numel() * weight_buf.element_size(), - weight_buf.data_ptr(), - linear_id, - lin_layer_mat_desc.mut_desc(), - &matrix_pointer, - unused_desc.mut_desc(), - &unused_pointer - )); + &matrix_pointer, + unused_desc.mut_desc(), + &unused_pointer)); } else { // bias - AT_CUDNN_CHECK(cudnnGetRNNWeightParams( - handle, - rnn_desc.desc(), - layer, - weight_buf.numel() * weight_buf.element_size(), - weight_buf.data_ptr(), - linear_id, - unused_desc.mut_desc(), - &unused_pointer, - lin_layer_mat_desc.mut_desc(), - &matrix_pointer - )); - } + AT_CUDNN_CHECK(cudnnGetRNNWeightParams( + handle, + rnn_desc.desc(), + layer, + weight_buf.numel() * weight_buf.element_size(), + weight_buf.data_ptr(), + linear_id, + unused_desc.mut_desc(), + 
&unused_pointer, + lin_layer_mat_desc.mut_desc(), + &matrix_pointer)); } + } #endif - cudnnDataType_t data_type; + cudnnDataType_t data_type; #ifndef USE_CUDNN_RNN_V8_API - cudnnTensorFormat_t format; + cudnnTensorFormat_t format; #else - int stride_dim_a[5]; + int stride_dim_a[5]; #endif - int nb_dims; - constexpr int min_dim = 3; - int filter_dim_a[min_dim]; + int nb_dims; + constexpr int min_dim = 3; + int filter_dim_a[min_dim]; #ifndef USE_CUDNN_RNN_V8_API - AT_CUDNN_CHECK( - cudnnGetFilterNdDescriptor( - lin_layer_mat_desc.desc(), - min_dim, - &data_type, - &format, - &nb_dims, - filter_dim_a - )); + AT_CUDNN_CHECK(cudnnGetFilterNdDescriptor( + lin_layer_mat_desc.desc(), + min_dim, + &data_type, + &format, + &nb_dims, + filter_dim_a)); #else - AT_CUDNN_CHECK( - cudnnGetTensorNdDescriptor( - lin_layer_mat_desc.desc(), - min_dim, - &data_type, - &nb_dims, - filter_dim_a, - stride_dim_a - )); + AT_CUDNN_CHECK(cudnnGetTensorNdDescriptor( + lin_layer_mat_desc.desc(), + min_dim, + &data_type, + &nb_dims, + filter_dim_a, + stride_dim_a)); #endif - TORCH_INTERNAL_ASSERT(nb_dims <= min_dim, "nb_dims = ", nb_dims, "; min_dim = ", min_dim); - auto elem_size = dataSize(getCudnnDataType(weight_buf)); - auto offset_bytes = (char*)matrix_pointer - (char*)weight_buf.data_ptr(); - TORCH_INTERNAL_ASSERT(offset_bytes % elem_size == 0, "offset_bytes = ", offset_bytes, "; elem_size = ", elem_size); - size_t offset = offset_bytes / elem_size; - // for all the RNN types provided by CUDNN, all the ih weights - // are the same size and are allocated in a contiguous chunk - // (same for the hh weights, and the ih and hh biases). - // Since we're storing all the weights in a single tensor anyway, - // might as well merge the CUDNN ones into a single tensor as well - int mat_numel = c10::multiply_integers(filter_dim_a, filter_dim_a + nb_dims); - if (linear_id == 0 || linear_id == num_linear_layers / 2) { - // We could also exclude bias params by restricting cudnn_methods to just { cudnnGetRNNLinLayerMatrixParams } - // at the very top. However, to do so would throw off the cur_offset account, which is currently a strict - // and informative check that all params are laid out the way we think they are. If include_bias is false, - // I'd rather keep full cur_offset checks rather than save some CPU overhead by skipping the cudnn_method = - // cudnnGetRNNLinLayerBiasParams iteration. + TORCH_INTERNAL_ASSERT( + nb_dims <= min_dim, + "nb_dims = ", + nb_dims, + "; min_dim = ", + min_dim); + auto elem_size = dataSize(getCudnnDataType(weight_buf)); + auto offset_bytes = + (char*)matrix_pointer - (char*)weight_buf.data_ptr(); + TORCH_INTERNAL_ASSERT( + offset_bytes % elem_size == 0, + "offset_bytes = ", + offset_bytes, + "; elem_size = ", + elem_size); + size_t offset = offset_bytes / elem_size; + // for all the RNN types provided by CUDNN, all the ih weights + // are the same size and are allocated in a contiguous chunk + // (same for the hh weights, and the ih and hh biases). + // Since we're storing all the weights in a single tensor anyway, + // might as well merge the CUDNN ones into a single tensor as well + int mat_numel = + c10::multiply_integers(filter_dim_a, filter_dim_a + nb_dims); + if (linear_id == 0 || linear_id == num_linear_layers / 2) { + // We could also exclude bias params by restricting cudnn_methods to + // just { cudnnGetRNNLinLayerMatrixParams } at the very top. 
However, + // to do so would throw off the cur_offset account, which is currently + // a strict and informative check that all params are laid out the way + // we think they are. If include_bias is false, I'd rather keep full + // cur_offset checks rather than save some CPU overhead by skipping + // the cudnn_method = cudnnGetRNNLinLayerBiasParams iteration. #ifndef USE_CUDNN_RNN_V8_API - if (include_bias || cudnn_method != cudnnGetRNNLinLayerBiasParams) { + if (include_bias || cudnn_method != cudnnGetRNNLinLayerBiasParams) { #else - if (include_bias || cudnn_method) { + if (include_bias || cudnn_method) { #endif - // Generate a new parameter tensor which is a view into the weight_buf. - std::initializer_list size = { + // Generate a new parameter tensor which is a view into the + // weight_buf. + std::initializer_list size = { mat_numel * num_linear_layers / 2, 1}; - Tensor param = at::empty({0}, weight_buf.options()).set_(weight_buf.storage(), offset, size); - params.emplace_back(std::move(param)); - layer_params_count++; - } - } else { - TORCH_INTERNAL_ASSERT(cur_offset == offset, "cur_offset = ", cur_offset, "; offset = ", offset); + Tensor param = at::empty({0}, weight_buf.options()) + .set_(weight_buf.storage(), offset, size); + params.emplace_back(std::move(param)); + layer_params_count++; } - cur_offset = offset + mat_numel; + } else { + TORCH_INTERNAL_ASSERT( + cur_offset == offset, + "cur_offset = ", + cur_offset, + "; offset = ", + offset); } - } // for cudnn_method - if (rnn.proj_size != 0) { + cur_offset = offset + mat_numel; + } + } // for cudnn_method + if (rnn.proj_size != 0) { #ifndef USE_CUDNN_RNN_V8_API - add_projection_weights(handle, rnn_desc, x_desc, w_desc, weight_buf, layer, params); + add_projection_weights( + handle, rnn_desc, x_desc, w_desc, weight_buf, layer, params); #else - add_projection_weights(handle, rnn_desc, weight_buf, layer, params); + add_projection_weights(handle, rnn_desc, weight_buf, layer, params); #endif - layer_params_count++; - } + layer_params_count++; + } - if (layer == 0) { - global_layer_params_count = layer_params_count; - } else { - TORCH_INTERNAL_ASSERT(global_layer_params_count == layer_params_count, - "global_layer_params_count = ", global_layer_params_count, - "; layer_params_count = ", layer_params_count); - } - } // for layer - return std::make_pair(params, global_layer_params_count); - } + if (layer == 0) { + global_layer_params_count = layer_params_count; + } else { + TORCH_INTERNAL_ASSERT( + global_layer_params_count == layer_params_count, + "global_layer_params_count = ", + global_layer_params_count, + "; layer_params_count = ", + layer_params_count); + } + } // for layer + return std::make_pair(params, global_layer_params_count); +} - // This is a lightweight version of the method above used to quickly get the expected - // parameter offsets. - std::vector get_expected_data_ptrs( - const Tensor& weight_buf, cudnnHandle_t handle, const RNNDescriptorParams& rnn, - const RNNDescriptor& rnn_desc, const TensorDescriptor& x_desc, cudnnDataType_t datatype) { +// This is a lightweight version of the method above used to quickly get the +// expected parameter offsets. 
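// A small usage sketch (illustrative, not part of this diff) of the
// (params, stride0) pair returned by get_parameters above: the flat vector is
// addressed like a matrix whose rows are layers and whose row stride is the
// per-layer parameter count, much like the MatrixRef handling in
// _viewOrCopyParams further down. The helper name, "params", and "stride0" are
// assumed to come from a prior get_parameters call.
#include <ATen/ATen.h>
#include <iostream>
#include <vector>

static void print_per_layer_params(
    const std::vector<at::Tensor>& params, size_t stride0) {
  const size_t num_layers = params.size() / stride0;
  for (size_t layer = 0; layer < num_layers; ++layer) {
    for (size_t j = 0; j < stride0; ++j) {
      // For an LSTM without projections and with biases, stride0 == 4 and j
      // indexes w_ih, w_hh, b_ih, b_hh in that order.
      const at::Tensor& p = params[layer * stride0 + j];
      std::cout << "layer " << layer << " param " << j
                << " numel " << p.numel() << "\n";
    }
  }
}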
+std::vector get_expected_data_ptrs( + const Tensor& weight_buf, + cudnnHandle_t handle, + const RNNDescriptorParams& rnn, + const RNNDescriptor& rnn_desc, + const TensorDescriptor& x_desc, + cudnnDataType_t datatype) { #ifndef USE_CUDNN_RNN_V8_API - FilterDescriptor w_desc; - w_desc.set(weight_buf, 3); + FilterDescriptor w_desc; + w_desc.set(weight_buf, 3); #endif - int64_t num_linear_layers = _num_linear_layers(rnn.mode); - int64_t num_dir_layers = rnn.num_directions() * rnn.num_layers; + int64_t num_linear_layers = _num_linear_layers(rnn.mode); + int64_t num_dir_layers = rnn.num_directions() * rnn.num_layers; #ifndef USE_CUDNN_RNN_V8_API - const auto cudnn_methods = { cudnnGetRNNLinLayerMatrixParams, cudnnGetRNNLinLayerBiasParams }; + const auto cudnn_methods = { + cudnnGetRNNLinLayerMatrixParams, cudnnGetRNNLinLayerBiasParams}; #else - const auto cudnn_methods = { true, false }; + const auto cudnn_methods = {true, false}; #endif - std::vector data_ptrs; - if (rnn.proj_size != 0) { - data_ptrs.reserve(num_dir_layers * (2 * 2 + 1)); - } else { - data_ptrs.reserve(num_dir_layers * 2 * 2); - } - for (const auto layer : c10::irange(num_dir_layers)) { - for (auto cudnn_method : cudnn_methods) { - // This API returns a separate pointer for weight of every gate, - // but we represent them as a single tensor, so we're only interested - // in a very limited subset of possible values. - const std::array linear_offsets = { 0, num_linear_layers / 2 }; - for (int64_t linear_id : linear_offsets) { - void* matrix_pointer; + std::vector data_ptrs; + if (rnn.proj_size != 0) { + data_ptrs.reserve(num_dir_layers * (2 * 2 + 1)); + } else { + data_ptrs.reserve(num_dir_layers * 2 * 2); + } + for (const auto layer : c10::irange(num_dir_layers)) { + for (auto cudnn_method : cudnn_methods) { + // This API returns a separate pointer for weight of every gate, + // but we represent them as a single tensor, so we're only interested + // in a very limited subset of possible values. + const std::array linear_offsets = {0, num_linear_layers / 2}; + for (int64_t linear_id : linear_offsets) { + void* matrix_pointer; #ifndef USE_CUDNN_RNN_V8_API - FilterDescriptor lin_layer_mat_desc; - AT_CUDNN_CHECK(cudnn_method( - handle, - rnn_desc.desc(), - layer, - x_desc.desc(), - w_desc.desc(), - weight_buf.data_ptr(), - linear_id, - lin_layer_mat_desc.mut_desc(), - &matrix_pointer - )); + FilterDescriptor lin_layer_mat_desc; + AT_CUDNN_CHECK(cudnn_method( + handle, + rnn_desc.desc(), + layer, + x_desc.desc(), + w_desc.desc(), + weight_buf.data_ptr(), + linear_id, + lin_layer_mat_desc.mut_desc(), + &matrix_pointer)); #else - void *unused_pointer = nullptr; + void* unused_pointer = nullptr; TensorDescriptor unused_desc; TensorDescriptor lin_layer_mat_desc; - if (cudnn_method) { // matrix - AT_CUDNN_CHECK(cudnnGetRNNWeightParams( - handle, - rnn_desc.desc(), - layer, - weight_buf.numel() * weight_buf.element_size(), - weight_buf.data_ptr(), - linear_id, - lin_layer_mat_desc.mut_desc(), - &matrix_pointer, - unused_desc.mut_desc(), - &unused_pointer - )); - } else { // bias - AT_CUDNN_CHECK(cudnnGetRNNWeightParams( - handle, - rnn_desc.desc(), - layer, - weight_buf.numel() * weight_buf.element_size(), - weight_buf.data_ptr(), - linear_id, - unused_desc.mut_desc(), - &unused_pointer, - lin_layer_mat_desc.mut_desc(), - &matrix_pointer - )); - } -#endif - data_ptrs.push_back(matrix_pointer); - } - } - if (rnn.proj_size != 0) { - // assuming it's LSTM which has 8 "linear layers" (i.e. 
4 weights and 4 biases) - int64_t linear_id = 8; - void* matrix_pointer; -#ifndef USE_CUDNN_RNN_V8_API - FilterDescriptor lin_layer_mat_desc; - AT_CUDNN_CHECK(cudnnGetRNNLinLayerMatrixParams( + if (cudnn_method) { // matrix + AT_CUDNN_CHECK(cudnnGetRNNWeightParams( handle, rnn_desc.desc(), layer, - x_desc.desc(), - w_desc.desc(), + weight_buf.numel() * weight_buf.element_size(), weight_buf.data_ptr(), linear_id, lin_layer_mat_desc.mut_desc(), - &matrix_pointer - )); -#else - void *unused_pointer; - TensorDescriptor unused_desc; - TensorDescriptor lin_layer_mat_desc; - - AT_CUDNN_CHECK(cudnnGetRNNWeightParams( + &matrix_pointer, + unused_desc.mut_desc(), + &unused_pointer)); + } else { // bias + AT_CUDNN_CHECK(cudnnGetRNNWeightParams( handle, rnn_desc.desc(), layer, weight_buf.numel() * weight_buf.element_size(), weight_buf.data_ptr(), linear_id, + unused_desc.mut_desc(), + &unused_pointer, lin_layer_mat_desc.mut_desc(), - &matrix_pointer, - unused_desc.mut_desc(), &unused_pointer)); + &matrix_pointer)); + } #endif data_ptrs.push_back(matrix_pointer); } } - return data_ptrs; - } + if (rnn.proj_size != 0) { + // assuming it's LSTM which has 8 "linear layers" (i.e. 4 weights and 4 + // biases) + int64_t linear_id = 8; + void* matrix_pointer; +#ifndef USE_CUDNN_RNN_V8_API + FilterDescriptor lin_layer_mat_desc; + AT_CUDNN_CHECK(cudnnGetRNNLinLayerMatrixParams( + handle, + rnn_desc.desc(), + layer, + x_desc.desc(), + w_desc.desc(), + weight_buf.data_ptr(), + linear_id, + lin_layer_mat_desc.mut_desc(), + &matrix_pointer)); +#else + void* unused_pointer; + TensorDescriptor unused_desc; + TensorDescriptor lin_layer_mat_desc; - void _viewOrCopyOneParam(const Tensor& param_from, const Tensor& param_to, - bool copy, bool allow_type_change=false) { - // if copying, allow_type_change may be true or false. - // if viewing, allow_type_change must be false. - TORCH_INTERNAL_ASSERT(copy || !allow_type_change, - "if viewing, type change is not allowed."); - TORCH_INTERNAL_ASSERT(allow_type_change || (param_from.scalar_type() == param_to.scalar_type()), - "parameter types mismatch"); - if (copy) { - param_to.copy_(param_from.view_as(param_to)); - } else { - param_from.resize_as_(param_to); + AT_CUDNN_CHECK(cudnnGetRNNWeightParams( + handle, + rnn_desc.desc(), + layer, + weight_buf.numel() * weight_buf.element_size(), + weight_buf.data_ptr(), + linear_id, + lin_layer_mat_desc.mut_desc(), + &matrix_pointer, + unused_desc.mut_desc(), + &unused_pointer)); +#endif + data_ptrs.push_back(matrix_pointer); } } + return data_ptrs; +} - void _viewOrCopyParams(MatrixRef params_from, MatrixRef params_to, - bool copy, bool allow_type_change=false) { - TORCH_INTERNAL_ASSERT(params_from.size(0) == params_to.size(0), "number of layers mismatch"); - for (const auto i : c10::irange(params_from.size(0))) { - auto layer_params_from = params_from[i]; - auto layer_params_to = params_to[i]; - // NOTE: these lists have all weights before all biases, so if the layer - // doesn't use biases, iteration will terminate once layer_params_from ends - // and ignore them. - - // NOTE: there is an exception from the above statement. If LSTMs with projections - // are used, weights layout will be w_ih, w_hh, b_ih, b_hh, w_hr. So need to handle no-bias - // case specially, because will need to copy 0->0, 1->1, 2->4. This case can be uniquely - // identified by checking if number of defined parameters for each layer is 3. 
- if (layer_params_from.size() == 3 && layer_params_to.size() != 3) { - _viewOrCopyOneParam(layer_params_from[0], layer_params_to[0], copy, allow_type_change); - _viewOrCopyOneParam(layer_params_from[1], layer_params_to[1], copy, allow_type_change); - _viewOrCopyOneParam(layer_params_from[2], layer_params_to[4], copy, allow_type_change); - continue; - } - if (layer_params_to.size() == 3 && layer_params_from.size() != 3) { - _viewOrCopyOneParam(layer_params_from[0], layer_params_to[0], copy, allow_type_change); - _viewOrCopyOneParam(layer_params_from[1], layer_params_to[1], copy, allow_type_change); - _viewOrCopyOneParam(layer_params_from[4], layer_params_to[2], copy, allow_type_change); - continue; - } - for (auto a = layer_params_from.begin(), b = layer_params_to.begin(); - a != layer_params_from.end() && b != layer_params_to.end(); - ++a, ++b) { - _viewOrCopyOneParam(*a, *b, copy, allow_type_change); - } - } +void _viewOrCopyOneParam( + const Tensor& param_from, + const Tensor& param_to, + bool copy, + bool allow_type_change = false) { + // if copying, allow_type_change may be true or false. + // if viewing, allow_type_change must be false. + TORCH_INTERNAL_ASSERT( + copy || !allow_type_change, "if viewing, type change is not allowed."); + TORCH_INTERNAL_ASSERT( + allow_type_change || (param_from.scalar_type() == param_to.scalar_type()), + "parameter types mismatch"); + if (copy) { + param_to.copy_(param_from.view_as(param_to)); + } else { + param_from.resize_as_(param_to); } +} - void _copyParams(MatrixRef params_from, MatrixRef params_to) { - _viewOrCopyParams(params_from, params_to, true); +void _viewOrCopyParams( + MatrixRef params_from, + MatrixRef params_to, + bool copy, + bool allow_type_change = false) { + TORCH_INTERNAL_ASSERT( + params_from.size(0) == params_to.size(0), "number of layers mismatch"); + for (const auto i : c10::irange(params_from.size(0))) { + auto layer_params_from = params_from[i]; + auto layer_params_to = params_to[i]; + // NOTE: these lists have all weights before all biases, so if the layer + // doesn't use biases, iteration will terminate once layer_params_from ends + // and ignore them. + + // NOTE: there is an exception from the above statement. If LSTMs with + // projections are used, weights layout will be w_ih, w_hh, b_ih, b_hh, + // w_hr. So need to handle no-bias case specially, because will need to copy + // 0->0, 1->1, 2->4. This case can be uniquely identified by checking if + // number of defined parameters for each layer is 3. 
+ if (layer_params_from.size() == 3 && layer_params_to.size() != 3) { + _viewOrCopyOneParam( + layer_params_from[0], layer_params_to[0], copy, allow_type_change); + _viewOrCopyOneParam( + layer_params_from[1], layer_params_to[1], copy, allow_type_change); + _viewOrCopyOneParam( + layer_params_from[2], layer_params_to[4], copy, allow_type_change); + continue; + } + if (layer_params_to.size() == 3 && layer_params_from.size() != 3) { + _viewOrCopyOneParam( + layer_params_from[0], layer_params_to[0], copy, allow_type_change); + _viewOrCopyOneParam( + layer_params_from[1], layer_params_to[1], copy, allow_type_change); + _viewOrCopyOneParam( + layer_params_from[4], layer_params_to[2], copy, allow_type_change); + continue; + } + for (auto a = layer_params_from.begin(), b = layer_params_to.begin(); + a != layer_params_from.end() && b != layer_params_to.end(); + ++a, ++b) { + _viewOrCopyOneParam(*a, *b, copy, allow_type_change); + } } +} - void _viewParams(MatrixRef params_from, MatrixRef params_to) { - _viewOrCopyParams(params_from, params_to, false); - } +void _copyParams(MatrixRef params_from, MatrixRef params_to) { + _viewOrCopyParams(params_from, params_to, true); +} +void _viewParams(MatrixRef params_from, MatrixRef params_to) { + _viewOrCopyParams(params_from, params_to, false); +} - std::vector _input_size(const TensorDescriptorListParams& tensors) { - if (tensors.is_input_packed()) { - return {tensors.batch_sizes_sum, tensors.input_size}; - } else { - return {tensors.seq_length, tensors.mini_batch, tensors.input_size}; - } +std::vector _input_size(const TensorDescriptorListParams& tensors) { + if (tensors.is_input_packed()) { + return {tensors.batch_sizes_sum, tensors.input_size}; + } else { + return {tensors.seq_length, tensors.mini_batch, tensors.input_size}; } +} - std::vector _hidden_size(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors) { - if (rnn.proj_size != 0) { - return {rnn.num_layers * rnn.num_directions(), tensors.mini_batch, rnn.proj_size}; - } else { - return {rnn.num_layers * rnn.num_directions(), tensors.mini_batch, rnn.hidden_size}; - } +std::vector _hidden_size( + const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors) { + if (rnn.proj_size != 0) { + return { + rnn.num_layers * rnn.num_directions(), + tensors.mini_batch, + rnn.proj_size}; + } else { + return { + rnn.num_layers * rnn.num_directions(), + tensors.mini_batch, + rnn.hidden_size}; } +} - std::vector _cell_size(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors) { - return {rnn.num_layers * rnn.num_directions(), tensors.mini_batch, rnn.hidden_size}; - } +std::vector _cell_size( + const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors) { + return { + rnn.num_layers * rnn.num_directions(), + tensors.mini_batch, + rnn.hidden_size}; +} - std::vector _output_size(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors) { - auto out_size = rnn.hidden_size; - if (rnn.proj_size != 0) { - out_size = rnn.proj_size; - } - if (tensors.is_input_packed()) { - return {tensors.batch_sizes_sum, out_size * rnn.num_directions()}; - } else { - return {tensors.seq_length, tensors.mini_batch, out_size * rnn.num_directions()}; - } +std::vector _output_size( + const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors) { + auto out_size = rnn.hidden_size; + if (rnn.proj_size != 0) { + out_size = rnn.proj_size; } - - inline bool use_persist_common_heuristics(const RNNDescriptorParams& rnn, - const 
TensorDescriptorListParams& tensors) { - return rnn.num_layers == 1 && - rnn.hidden_size <= 1024 && - rnn.num_directions() == 1 && - rnn.hidden_size % 128 == 0 && - tensors.input_size % 128 == 0; + if (tensors.is_input_packed()) { + return {tensors.batch_sizes_sum, out_size * rnn.num_directions()}; + } else { + return { + tensors.seq_length, + tensors.mini_batch, + out_size * rnn.num_directions()}; } +} - inline bool use_persist_device_heuristics(const RNNDescriptorParams& rnn, - const TensorDescriptorListParams& tensors) { - auto bsize = tensors.mini_batch; - cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); - if (prop->major == 7) { - if (prop->minor == 5) { - // Excludes Turing from using persistent rnn. - return false; - } else { - // technically, batch size should be multiple of 8, but there are quite a few multiple-of-8 batchsizes that give bad perf, - // weed them out - return ((bsize % 16 == 0 && bsize != 80 && bsize !=112) || bsize == 8) && - ((tensors.seq_length >=40 && bsize <=128) || - (tensors.seq_length >=20 && bsize <=96) || - (tensors.seq_length >=10 && bsize <=32)); - } - } else if (prop->major >= 8 && prop->multiProcessorCount >= 98) { - // SM count check excludes A30 (similar issue to A40) - if (prop->minor == 6) { - // Excludes sm_86 GPU devices from using persistent rnn. - // This is because there are some edge cases that will throw exceptions with cudnn 8.0.5 on Nvidia A40 GPU. - return false; - } - // Based on tests by Vasily Volkov and xwang233. Vasily only tried bsize <= 128, - // so conservatively enable persistence for bsize <= 128 only. - // TODO: Run more tests for bsize > 128. - if (rnn.mode == CUDNN_GRU) { - // Persistent GRU performance is flakier than other RNN types. Exclude them for now. - // TODO: Write a more refined GRU heuristic. - return false; - } else if (rnn.mode == CUDNN_LSTM) { - // Persistent LSTMs are comparable to or better than non-persistent for bsize <= 128. - return (bsize % 8 == 0) && (bsize <= 128); - } else { - // Persistent RNN_RELU and TANH show poor performance when bsize >= 96 AND hidden size >= 896. - return (bsize % 8 == 0) && (bsize <= 128) && (bsize < 96 || rnn.hidden_size < 896); - } +inline bool use_persist_common_heuristics( + const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors) { + return rnn.num_layers == 1 && rnn.hidden_size <= 1024 && + rnn.num_directions() == 1 && rnn.hidden_size % 128 == 0 && + tensors.input_size % 128 == 0; +} + +inline bool use_persist_device_heuristics( + const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors) { + auto bsize = tensors.mini_batch; + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + if (prop->major == 7) { + if (prop->minor == 5) { + // Excludes Turing from using persistent rnn. + return false; } else { + // technically, batch size should be multiple of 8, but there are quite a + // few multiple-of-8 batchsizes that give bad perf, weed them out + return ((bsize % 16 == 0 && bsize != 80 && bsize != 112) || bsize == 8) && + ((tensors.seq_length >= 40 && bsize <= 128) || + (tensors.seq_length >= 20 && bsize <= 96) || + (tensors.seq_length >= 10 && bsize <= 32)); + } + } else if (prop->major >= 8 && prop->multiProcessorCount >= 98) { + // SM count check excludes A30 (similar issue to A40) + if (prop->minor == 6) { + // Excludes sm_86 GPU devices from using persistent rnn. + // This is because there are some edge cases that will throw exceptions + // with cudnn 8.0.5 on Nvidia A40 GPU. 
return false; } + // Based on tests by Vasily Volkov and xwang233. Vasily only tried bsize <= + // 128, so conservatively enable persistence for bsize <= 128 only. + // TODO: Run more tests for bsize > 128. + if (rnn.mode == CUDNN_GRU) { + // Persistent GRU performance is flakier than other RNN types. Exclude + // them for now. + // TODO: Write a more refined GRU heuristic. + return false; + } else if (rnn.mode == CUDNN_LSTM) { + // Persistent LSTMs are comparable to or better than non-persistent for + // bsize <= 128. + return (bsize % 8 == 0) && (bsize <= 128); + } else { + // Persistent RNN_RELU and TANH show poor performance when bsize >= 96 AND + // hidden size >= 896. + return (bsize % 8 == 0) && (bsize <= 128) && + (bsize < 96 || rnn.hidden_size < 896); + } + } else { + return false; } +} - inline bool use_rnn_persist_small_h(const RNNDescriptorParams& rnn, - const TensorDescriptorListParams& tensors, - bool forward) { +inline bool use_rnn_persist_small_h( + const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors, + bool forward) { #if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8201 // 8.2.1 - cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); - if (prop->major < 6) return false; + cudaDeviceProp* prop = at::cuda::getCurrentDeviceProperties(); + if (prop->major < 6) + return false; - if (forward) { - if (rnn.mode == CUDNN_RNN_RELU || rnn.mode == CUDNN_RNN_TANH) { - return rnn.hidden_size <= 384; - } - if (rnn.mode == CUDNN_LSTM || rnn.mode == CUDNN_GRU) { - return rnn.hidden_size <= 192; - } - } else /* backward */ { - if (rnn.mode == CUDNN_RNN_RELU || rnn.mode == CUDNN_RNN_TANH) { - return rnn.hidden_size <= 256; - } - if (rnn.mode == CUDNN_LSTM || rnn.mode == CUDNN_GRU) { - return rnn.hidden_size <= 128; - } + if (forward) { + if (rnn.mode == CUDNN_RNN_RELU || rnn.mode == CUDNN_RNN_TANH) { + return rnn.hidden_size <= 384; + } + if (rnn.mode == CUDNN_LSTM || rnn.mode == CUDNN_GRU) { + return rnn.hidden_size <= 192; } + } else /* backward */ { + if (rnn.mode == CUDNN_RNN_RELU || rnn.mode == CUDNN_RNN_TANH) { + return rnn.hidden_size <= 256; + } + if (rnn.mode == CUDNN_LSTM || rnn.mode == CUDNN_GRU) { + return rnn.hidden_size <= 128; + } + } - return false; + return false; #else - return false; + return false; #endif - } +} - cudnnRNNAlgo_t get_algo(const RNNDescriptorParams& rnn, const TensorDescriptorListParams& tensors, const Tensor input, bool forward) { - // LSTM with projections only works with standard algorithm - if (rnn.proj_size != 0) { - return CUDNN_RNN_ALGO_STANDARD; - } +cudnnRNNAlgo_t get_algo( + const RNNDescriptorParams& rnn, + const TensorDescriptorListParams& tensors, + const Tensor input, + bool forward) { + // LSTM with projections only works with standard algorithm + if (rnn.proj_size != 0) { + return CUDNN_RNN_ALGO_STANDARD; + } - // Persistent algos typically don't work for packed inputs with sequence lengths that vary - // across batch elements, and will return CUDNN_STATUS_NOT_SUPPORTED if attempted. See - // https://docs.nvidia.com/deeplearning/cudnn/developer-guide/index.html#features-of-rnn-functions - if (!tensors.is_input_packed()) { - auto cudnnDataType = getCudnnDataType(input); + // Persistent algos typically don't work for packed inputs with sequence + // lengths that vary across batch elements, and will return + // CUDNN_STATUS_NOT_SUPPORTED if attempted. 
See + // https://docs.nvidia.com/deeplearning/cudnn/developer-guide/index.html#features-of-rnn-functions + if (!tensors.is_input_packed()) { + auto cudnnDataType = getCudnnDataType(input); #if defined(CUDNN_VERSION) && CUDNN_VERSION >= 8201 // 8.2.1 - if (cudnnDataType != CUDNN_DATA_DOUBLE) { - if (use_rnn_persist_small_h(rnn, tensors, forward)) { - return CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H; - } + if (cudnnDataType != CUDNN_DATA_DOUBLE) { + if (use_rnn_persist_small_h(rnn, tensors, forward)) { + return CUDNN_RNN_ALGO_PERSIST_STATIC_SMALL_H; } + } #endif - if (cudnnDataType == CUDNN_DATA_HALF) { - if (use_persist_common_heuristics(rnn, tensors) && - use_persist_device_heuristics(rnn, tensors)) { - return CUDNN_RNN_ALGO_PERSIST_STATIC; - } + if (cudnnDataType == CUDNN_DATA_HALF) { + if (use_persist_common_heuristics(rnn, tensors) && + use_persist_device_heuristics(rnn, tensors)) { + return CUDNN_RNN_ALGO_PERSIST_STATIC; } } - - return CUDNN_RNN_ALGO_STANDARD; } - cudnnDataType_t promote_rnn_math_type(cudnnDataType_t dtype) { - if (dtype == CUDNN_DATA_HALF) { - return CUDNN_DATA_FLOAT; - } - return dtype; + return CUDNN_RNN_ALGO_STANDARD; +} + +cudnnDataType_t promote_rnn_math_type(cudnnDataType_t dtype) { + if (dtype == CUDNN_DATA_HALF) { + return CUDNN_DATA_FLOAT; } + return dtype; +} -} // anonymous namespace +} // namespace native // Utilities exposed in RNNUtils.h namespace cudnn_rnn { @@ -1097,7 +1281,8 @@ copy_weights_to_flat_buf_views( #ifdef USE_CUDNN_RNN_V8_API input_size, false, // eqy: bogus as we do not know if the input is packed here - // but it should not affect the weights (what are are interested in) + // but it should not affect the weights (what are are interested + // in) #endif hidden_size, proj_size, @@ -1130,9 +1315,7 @@ copy_weights_to_flat_buf_views( #endif // Slice off views into weight_buf - std::vector params_arr; - size_t params_stride0; - std::tie(params_arr, params_stride0) = get_parameters( + auto [params_arr, params_stride0] = get_parameters( #ifndef USE_CUDNN_RNN_V8_API handle, rnn, rnn_desc, x_desc, w_desc, weight_buf, include_bias); #else @@ -1177,12 +1360,15 @@ using namespace cudnn_rnn; // functions, only one of which does an inplace update, but we leave this // for future work Tensor _cudnn_rnn_flatten_weight( - TensorList weight_arr, int64_t weight_stride0, + TensorList weight_arr, + int64_t weight_stride0, int64_t input_size, - int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, - int64_t fn_num_layers, bool batch_first, - bool fn_bidirectional - ) { + int64_t fn_mode, + int64_t fn_hidden_size, + int64_t fn_proj_size, + int64_t fn_num_layers, + bool batch_first, + bool fn_bidirectional) { // returns flat weight_buf return std::get<0>(copy_weights_to_flat_buf_views( weight_arr, @@ -1199,24 +1385,37 @@ Tensor _cudnn_rnn_flatten_weight( /*set_orig_weights_to_flat_buf=*/true)); } -const char * WEIGHT_FORMAT_WARN = "RNN module weights are not part of single contiguous " - "chunk of memory. This means they need to be compacted " - "at every call, possibly greatly increasing memory usage. " - "To compact weights again call flatten_parameters()."; +const char* WEIGHT_FORMAT_WARN = + "RNN module weights are not part of single contiguous " + "chunk of memory. This means they need to be compacted " + "at every call, possibly greatly increasing memory usage. 
" + "To compact weights again call flatten_parameters()."; // NB: when fn_batch_sizes is empty, that means no batch sizes was specified std::tuple _cudnn_rnn( const Tensor& input_r, - TensorList weight, int64_t weight_stride0, const c10::optional& weight_buf_r_opt, const Tensor& hx, const c10::optional& cx_opt, - int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, - int64_t fn_num_layers, bool batch_first, double fn_dropout, - bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes, const c10::optional& fn_dropout_state_opt - ) { + TensorList weight, + int64_t weight_stride0, + const c10::optional& weight_buf_r_opt, + const Tensor& hx, + const c10::optional& cx_opt, + int64_t fn_mode, + int64_t fn_hidden_size, + int64_t fn_proj_size, + int64_t fn_num_layers, + bool batch_first, + double fn_dropout, + bool fn_train, + bool fn_bidirectional, + IntArrayRef fn_batch_sizes, + const c10::optional& fn_dropout_state_opt) { // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned weight_buf_r_maybe_owned = at::borrow_from_optional_tensor(weight_buf_r_opt); + c10::MaybeOwned weight_buf_r_maybe_owned = + at::borrow_from_optional_tensor(weight_buf_r_opt); const Tensor& weight_buf_r = *weight_buf_r_maybe_owned; - const Tensor& cx = c10::value_or_else(cx_opt, [] {return Tensor();}); - const Tensor& fn_dropout_state = c10::value_or_else(fn_dropout_state_opt, [] {return Tensor();}); + const Tensor& cx = c10::value_or_else(cx_opt, [] { return Tensor(); }); + const Tensor& fn_dropout_state = + c10::value_or_else(fn_dropout_state_opt, [] { return Tensor(); }); check_attributes(input_r, weight, {hx, cx}, /*check_dtype=*/true); auto input = input_r; @@ -1225,18 +1424,34 @@ std::tuple _cudnn_rnn( TORCH_WARN(WEIGHT_FORMAT_WARN); } if (fn_dropout_state.defined()) { - auto input_arg = TensorArg(input, "input", 1); - auto dropout_state_arg = TensorArg(fn_dropout_state, "dropout_states", 15); - checkSameGPU("cudnn_rnn", input_arg, dropout_state_arg); + auto input_arg = TensorArg(input, "input", 1); + auto dropout_state_arg = TensorArg(fn_dropout_state, "dropout_states", 15); + checkSameGPU("cudnn_rnn", input_arg, dropout_state_arg); } RNNParams fn; auto datatype = getCudnnDataType(input); #ifndef USE_CUDNN_RNN_V8_API - fn.rnn.set(fn_mode, fn_hidden_size, fn_proj_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype); + fn.rnn.set( + fn_mode, + fn_hidden_size, + fn_proj_size, + fn_num_layers, + fn_bidirectional, + promote_rnn_math_type(datatype), + datatype); #else auto input_size = input_r.size(-1); auto packed = fn_batch_sizes.size() != 0; - fn.rnn.set(fn_mode, input_size, packed, fn_hidden_size, fn_proj_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype); + fn.rnn.set( + fn_mode, + input_size, + packed, + fn_hidden_size, + fn_proj_size, + fn_num_layers, + fn_bidirectional, + promote_rnn_math_type(datatype), + datatype); #endif fn.dropout.set(fn_train, fn_dropout, fn_dropout_state); fn.tensors.set(input.sizes(), fn_batch_sizes, batch_first); @@ -1244,8 +1459,7 @@ std::tuple _cudnn_rnn( // TODO: Set device to input if (fn.rnn.mode != CUDNN_LSTM) { - TORCH_CHECK(!cx.defined(), - "rnn: illegal defined cx for non-LSTM RNN"); + TORCH_CHECK(!cx.defined(), "rnn: illegal defined cx for non-LSTM RNN"); } // TODO: can batch_first be a wrapper around this function? 
@@ -1258,10 +1472,8 @@ std::tuple _cudnn_rnn( auto cell_size = _cell_size(fn.rnn, fn.tensors); auto output_size = _output_size(fn.rnn, fn.tensors); - TORCH_CHECK(hx.is_contiguous(), - "rnn: hx is not contiguous"); - TORCH_CHECK(!cx.defined() || cx.is_contiguous(), - "rnn: cx is not contiguous"); + TORCH_CHECK(hx.is_contiguous(), "rnn: hx is not contiguous"); + TORCH_CHECK(!cx.defined() || cx.is_contiguous(), "rnn: cx is not contiguous"); auto x = input.contiguous(); auto output = at::empty(output_size, input.options()); @@ -1270,7 +1482,8 @@ std::tuple _cudnn_rnn( if (cx.defined()) { cy = at::empty(cell_size, cx.options()); } else { - cy = at::empty({0}, hx.options()); // NB: Not allowed to return undefined tensors + cy = at::empty( + {0}, hx.options()); // NB: Not allowed to return undefined tensors } auto y = output; @@ -1284,7 +1497,8 @@ std::tuple _cudnn_rnn( #endif if (!weight_buf.defined()) { #ifndef USE_CUDNN_RNN_V8_API - auto num_weights = get_num_weights(handle, descs.rnn_desc, descs.x_descs[0], datatype); + auto num_weights = + get_num_weights(handle, descs.rnn_desc, descs.x_descs[0], datatype); #else auto num_weights = get_num_weights(handle, descs.rnn_desc, datatype); #endif @@ -1293,23 +1507,28 @@ std::tuple _cudnn_rnn( w_desc.set(weight_buf, 3); #endif weight_buf.zero_(); - std::vector params; - size_t params_stride0; #ifndef USE_CUDNN_RNN_V8_API - std::tie(params, params_stride0) = get_parameters(handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, weight_buf); + auto [params, params_stride0] = get_parameters( + handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, weight_buf); #else - std::tie(params, params_stride0) = get_parameters(handle, fn.rnn, descs.rnn_desc, weight_buf); + auto [params, params_stride0] = + get_parameters(handle, fn.rnn, descs.rnn_desc, weight_buf); #endif - _copyParams(MatrixRef{weight, static_cast(weight_stride0)}, - MatrixRef{params, params_stride0}); + _copyParams( + MatrixRef{weight, static_cast(weight_stride0)}, + MatrixRef{params, params_stride0}); } else { #ifndef USE_CUDNN_RNN_V8_API w_desc.set(weight_buf, 3); #endif } - TORCH_CHECK(!cx.defined() || cx.sizes().equals(cell_size), - "Expected cell size ", IntArrayRef{cell_size}, ", got ", cx.sizes()); + TORCH_CHECK( + !cx.defined() || cx.sizes().equals(cell_size), + "Expected cell size ", + IntArrayRef{cell_size}, + ", got ", + cx.sizes()); size_t workspace_size; #ifndef USE_CUDNN_RNN_V8_API auto x_descs_arr = descs.get_x_descs(); @@ -1320,12 +1539,11 @@ std::tuple _cudnn_rnn( #endif #ifndef USE_CUDNN_RNN_V8_API AT_CUDNN_CHECK(cudnnGetRNNWorkspaceSize( - handle, - descs.rnn_desc.desc(), - fn.tensors.seq_length, - x_descs_arr.data(), - &workspace_size - )); + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + &workspace_size)); #endif Tensor workspace; Tensor reserve; @@ -1335,93 +1553,123 @@ std::tuple _cudnn_rnn( size_t reserve_size; #ifndef USE_CUDNN_RNN_V8_API AT_CUDNN_CHECK(cudnnGetRNNTrainingReserveSize( - handle, - descs.rnn_desc.desc(), - fn.tensors.seq_length, - x_descs_arr.data(), - &reserve_size - )); + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + &reserve_size)); #else AT_CUDNN_CHECK(cudnnGetRNNTempSpaceSizes( - handle, - descs.rnn_desc.desc(), - CUDNN_FWD_MODE_TRAINING, - x_descs_arr.desc(), - &workspace_size, - &reserve_size - )); + handle, + descs.rnn_desc.desc(), + CUDNN_FWD_MODE_TRAINING, + x_descs_arr.desc(), + &workspace_size, + &reserve_size)); #endif workspace = at::empty(workspace_size, 
input.options().dtype(kByte)); reserve = at::empty(reserve_size, input.options().dtype(kByte)); #ifndef USE_CUDNN_RNN_V8_API AT_CUDNN_CHECK(cudnnRNNForwardTraining( - handle, - descs.rnn_desc.desc(), - fn.tensors.seq_length, - x_descs_arr.data(), x.data_ptr(), - descs.hx_desc.desc(), hx.data_ptr(), - descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr, - w_desc.desc(), weight_buf.data_ptr(), - y_descs_arr.data(), y.data_ptr(), - descs.hy_desc.desc(), hy.data_ptr(), - descs.cy_desc.desc(), cy.defined() ? cy.data_ptr() : nullptr, - workspace.data_ptr(), workspace.size(0), - reserve.mutable_data_ptr(), reserve.size(0) - )); + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + x.data_ptr(), + descs.hx_desc.desc(), + hx.data_ptr(), + descs.cx_desc.desc(), + cx.defined() ? cx.data_ptr() : nullptr, + w_desc.desc(), + weight_buf.data_ptr(), + y_descs_arr.data(), + y.data_ptr(), + descs.hy_desc.desc(), + hy.data_ptr(), + descs.cy_desc.desc(), + cy.defined() ? cy.data_ptr() : nullptr, + workspace.data_ptr(), + workspace.size(0), + reserve.mutable_data_ptr(), + reserve.size(0))); #else AT_CUDNN_CHECK(cudnnRNNForward( - handle, - descs.rnn_desc.desc(), - CUDNN_FWD_MODE_TRAINING, - nullptr, - x_descs_arr.desc(), x.data_ptr(), - y_descs_arr.desc(), y.data_ptr(), - descs.hx_desc.desc(), hx.data_ptr(), hy.data_ptr(), - descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr, cy.defined() ? cy.data_ptr() : nullptr, - weight_buf.numel() * weight_buf.element_size(), weight_buf.data_ptr(), - workspace.size(0), workspace.data_ptr(), - reserve.size(0), reserve.mutable_data_ptr())); + handle, + descs.rnn_desc.desc(), + CUDNN_FWD_MODE_TRAINING, + nullptr, + x_descs_arr.desc(), + x.data_ptr(), + y_descs_arr.desc(), + y.data_ptr(), + descs.hx_desc.desc(), + hx.data_ptr(), + hy.data_ptr(), + descs.cx_desc.desc(), + cx.defined() ? cx.data_ptr() : nullptr, + cy.defined() ? cy.data_ptr() : nullptr, + weight_buf.numel() * weight_buf.element_size(), + weight_buf.data_ptr(), + workspace.size(0), + workspace.data_ptr(), + reserve.size(0), + reserve.mutable_data_ptr())); #endif } else { // inference #ifdef USE_CUDNN_RNN_V8_API AT_CUDNN_CHECK(cudnnGetRNNTempSpaceSizes( - handle, - descs.rnn_desc.desc(), - CUDNN_FWD_MODE_INFERENCE, - x_descs_arr.desc(), - &workspace_size, - NULL - )); + handle, + descs.rnn_desc.desc(), + CUDNN_FWD_MODE_INFERENCE, + x_descs_arr.desc(), + &workspace_size, + NULL)); #endif workspace = at::empty(workspace_size, input.options().dtype(kByte)); reserve = at::empty({0}, input.options().dtype(kByte)); #ifndef USE_CUDNN_RNN_V8_API AT_CUDNN_CHECK(cudnnRNNForwardInference( - handle, - descs.rnn_desc.desc(), - fn.tensors.seq_length, - x_descs_arr.data(), x.data_ptr(), - descs.hx_desc.desc(), hx.data_ptr(), - descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr, - w_desc.desc(), weight_buf.data_ptr(), - y_descs_arr.data(), y.data_ptr(), - descs.hy_desc.desc(), hy.data_ptr(), - descs.cy_desc.desc(), cy.defined() ? cy.data_ptr() : nullptr, - workspace.data_ptr(), workspace.size(0) - )); + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + x.data_ptr(), + descs.hx_desc.desc(), + hx.data_ptr(), + descs.cx_desc.desc(), + cx.defined() ? cx.data_ptr() : nullptr, + w_desc.desc(), + weight_buf.data_ptr(), + y_descs_arr.data(), + y.data_ptr(), + descs.hy_desc.desc(), + hy.data_ptr(), + descs.cy_desc.desc(), + cy.defined() ? 
cy.data_ptr() : nullptr, + workspace.data_ptr(), + workspace.size(0))); #else AT_CUDNN_CHECK(cudnnRNNForward( - handle, - descs.rnn_desc.desc(), - CUDNN_FWD_MODE_INFERENCE, - nullptr, - x_descs_arr.desc(), x.data_ptr(), - y_descs_arr.desc(), y.data_ptr(), - descs.hx_desc.desc(), hx.data_ptr(), hy.data_ptr(), - descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr, cy.defined() ? cy.data_ptr() : nullptr, - weight_buf.numel() * weight_buf.element_size(), weight_buf.data_ptr(), - workspace.size(0), workspace.data_ptr(), - reserve.size(0), reserve.mutable_data_ptr())); + handle, + descs.rnn_desc.desc(), + CUDNN_FWD_MODE_INFERENCE, + nullptr, + x_descs_arr.desc(), + x.data_ptr(), + y_descs_arr.desc(), + y.data_ptr(), + descs.hx_desc.desc(), + hx.data_ptr(), + hy.data_ptr(), + descs.cx_desc.desc(), + cx.defined() ? cx.data_ptr() : nullptr, + cy.defined() ? cy.data_ptr() : nullptr, + weight_buf.numel() * weight_buf.element_size(), + weight_buf.data_ptr(), + workspace.size(0), + workspace.data_ptr(), + reserve.size(0), + reserve.mutable_data_ptr())); #endif } @@ -1433,16 +1681,26 @@ std::tuple _cudnn_rnn( } std::tuple _cudnn_rnn_backward_input( - const Tensor& input_r, const Tensor& weight_buf, const Tensor& hx, const Tensor& cx, - const Tensor& output_r, const Tensor& grad_output_r, const Tensor& grad_hy, + const Tensor& input_r, + const Tensor& weight_buf, + const Tensor& hx, + const Tensor& cx, + const Tensor& output_r, + const Tensor& grad_output_r, + const Tensor& grad_hy, const Tensor& grad_cy, - int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, - int64_t fn_num_layers, bool batch_first, double fn_dropout, - bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes, - const Tensor& fn_dropout_state, const Tensor& fn_reserve, - std::array output_mask - ) { - + int64_t fn_mode, + int64_t fn_hidden_size, + int64_t fn_proj_size, + int64_t fn_num_layers, + bool batch_first, + double fn_dropout, + bool fn_train, + bool fn_bidirectional, + IntArrayRef fn_batch_sizes, + const Tensor& fn_dropout_state, + const Tensor& fn_reserve, + std::array output_mask) { auto input = input_r; auto grad_output = grad_output_r; auto output = output_r; @@ -1450,11 +1708,27 @@ std::tuple _cudnn_rnn_backward_input( RNNParams fn; auto datatype = getCudnnDataType(input); #ifndef USE_CUDNN_RNN_V8_API - fn.rnn.set(fn_mode, fn_hidden_size, fn_proj_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype); + fn.rnn.set( + fn_mode, + fn_hidden_size, + fn_proj_size, + fn_num_layers, + fn_bidirectional, + promote_rnn_math_type(datatype), + datatype); #else auto cudnn_input_size = input_r.size(-1); auto packed = fn_batch_sizes.size() != 0; - fn.rnn.set(fn_mode, cudnn_input_size, packed, fn_hidden_size, fn_proj_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype); + fn.rnn.set( + fn_mode, + cudnn_input_size, + packed, + fn_hidden_size, + fn_proj_size, + fn_num_layers, + fn_bidirectional, + promote_rnn_math_type(datatype), + datatype); #endif fn.dropout.set(fn_train, fn_dropout, fn_dropout_state); fn.tensors.set(input.sizes(), fn_batch_sizes, batch_first); @@ -1463,8 +1737,7 @@ std::tuple _cudnn_rnn_backward_input( auto handle = getCudnnHandle(); if (fn.rnn.mode != CUDNN_LSTM) { - TORCH_CHECK(!cx.defined(), - "rnn: illegal defined cx for non-LSTM RNN"); + TORCH_CHECK(!cx.defined(), "rnn: illegal defined cx for non-LSTM RNN"); } auto is_input_packed = fn_batch_sizes.size() != 0; @@ -1479,41 +1752,68 @@ std::tuple _cudnn_rnn_backward_input( auto 
cell_size = _cell_size(fn.rnn, fn.tensors); auto output_size = _output_size(fn.rnn, fn.tensors); - TORCH_CHECK(hx.is_contiguous(), - "rnn: hx is not contiguous"); - TORCH_CHECK(!cx.defined() || cx.is_contiguous(), - "rnn: cx is not contiguous"); + TORCH_CHECK(hx.is_contiguous(), "rnn: hx is not contiguous"); + TORCH_CHECK(!cx.defined() || cx.is_contiguous(), "rnn: cx is not contiguous"); auto x = input.contiguous(); auto dy = grad_output.contiguous(); auto y = output; auto w = weight_buf; - auto dx = at::empty(input.sizes(), input.options()); // TODO: more compact way of saying this + auto dx = at::empty( + input.sizes(), input.options()); // TODO: more compact way of saying this auto dhy = grad_hy.contiguous().view(hidden_size); - auto dcy = grad_cy.defined() ? grad_cy.contiguous().view(cell_size) : Tensor(); + auto dcy = + grad_cy.defined() ? grad_cy.contiguous().view(cell_size) : Tensor(); auto dhx = at::empty(hidden_size, hx.options()); - TORCH_INTERNAL_ASSERT(cx.defined() || !output_mask[2], "illegally required grad of cx for non-LSTM RNN"); + TORCH_INTERNAL_ASSERT( + cx.defined() || !output_mask[2], + "illegally required grad of cx for non-LSTM RNN"); auto dcx = cx.defined() ? at::empty(cell_size, cx.options()) : Tensor(); - TORCH_CHECK(fn_train, - "cudnn RNN backward can only be called in training mode"); + TORCH_CHECK( + fn_train, "cudnn RNN backward can only be called in training mode"); - TORCH_CHECK(input.sizes().equals(input_size), - "Expected input size ", IntArrayRef{input_size}, ", got ", input.sizes()); - TORCH_CHECK(output.sizes().equals(output_size), - "Expected output size ", IntArrayRef{output_size}, ", got ", output.sizes()); + TORCH_CHECK( + input.sizes().equals(input_size), + "Expected input size ", + IntArrayRef{input_size}, + ", got ", + input.sizes()); + TORCH_CHECK( + output.sizes().equals(output_size), + "Expected output size ", + IntArrayRef{output_size}, + ", got ", + output.sizes()); - TORCH_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), - "Expected hidden size ", IntArrayRef{hidden_size}, ", got ", hx.sizes()); - TORCH_CHECK(!cx.defined() || cx.sizes().equals(cell_size), - "Expected cell size ", IntArrayRef{cell_size}, ", got ", cx.sizes()); - TORCH_CHECK(!dhy.defined() || dhy.sizes().equals(hidden_size), - "Expected d_hidden size ", IntArrayRef{hidden_size}, ", got ", dhy.sizes()); - TORCH_CHECK(!dcy.defined() || dcy.sizes().equals(cell_size), - "Expected d_cell size ", IntArrayRef{cell_size}, ", got ", dcy.sizes()); + TORCH_CHECK( + !hx.defined() || hx.sizes().equals(hidden_size), + "Expected hidden size ", + IntArrayRef{hidden_size}, + ", got ", + hx.sizes()); + TORCH_CHECK( + !cx.defined() || cx.sizes().equals(cell_size), + "Expected cell size ", + IntArrayRef{cell_size}, + ", got ", + cx.sizes()); + TORCH_CHECK( + !dhy.defined() || dhy.sizes().equals(hidden_size), + "Expected d_hidden size ", + IntArrayRef{hidden_size}, + ", got ", + dhy.sizes()); + TORCH_CHECK( + !dcy.defined() || dcy.sizes().equals(cell_size), + "Expected d_cell size ", + IntArrayRef{cell_size}, + ", got ", + dcy.sizes()); - TORCH_CHECK(dhy.is_cuda() && dy.is_cuda() && (!dcy.defined() || dcy.is_cuda()), - "Gradients aren't CUDA tensors"); + TORCH_CHECK( + dhy.is_cuda() && dy.is_cuda() && (!dcy.defined() || dcy.is_cuda()), + "Gradients aren't CUDA tensors"); cudnnRNNAlgo_t algo = get_algo(fn.rnn, fn.tensors, input, false); fn.rnn.set_algo(algo); @@ -1529,61 +1829,77 @@ std::tuple _cudnn_rnn_backward_input( auto x_descs_arr = descs.get_x_descs(); auto y_descs_arr = 
descs.get_y_descs(); AT_CUDNN_CHECK(cudnnGetRNNWorkspaceSize( - handle, - descs.rnn_desc.desc(), - fn.tensors.seq_length, - x_descs_arr.data(), - &workspace_size - )); + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + &workspace_size)); #else auto& x_descs_arr = descs.x_descs; auto& y_descs_arr = descs.y_descs; AT_CUDNN_CHECK(cudnnGetRNNTempSpaceSizes( - handle, - descs.rnn_desc.desc(), - CUDNN_FWD_MODE_TRAINING, - x_descs_arr.desc(), - &workspace_size, - NULL - )); + handle, + descs.rnn_desc.desc(), + CUDNN_FWD_MODE_TRAINING, + x_descs_arr.desc(), + &workspace_size, + NULL)); #endif // TODO: put this in the correct device??? Tensor workspace = at::empty(workspace_size, input.options().dtype(kByte)); #ifndef USE_CUDNN_RNN_V8_API AT_CUDNN_CHECK(cudnnRNNBackwardData( - handle, - descs.rnn_desc.desc(), - fn.tensors.seq_length, - y_descs_arr.data(), y.data_ptr(), - y_descs_arr.data(), dy.data_ptr(), - descs.hy_desc.desc(), dhy.data_ptr(), - descs.cy_desc.desc(), cx.defined() ? dcy.data_ptr() : nullptr, - w_desc.desc(), w.data_ptr(), - descs.hx_desc.desc(), hx.data_ptr(), - descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr, - x_descs_arr.data(), dx.data_ptr(), - descs.hx_desc.desc(), dhx.data_ptr(), - descs.cx_desc.desc(), cx.defined() ? dcx.data_ptr() : nullptr, - workspace.data_ptr(), workspace.size(0), - fn_reserve.data_ptr(), fn_reserve.size(0) - )); + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + y_descs_arr.data(), + y.data_ptr(), + y_descs_arr.data(), + dy.data_ptr(), + descs.hy_desc.desc(), + dhy.data_ptr(), + descs.cy_desc.desc(), + cx.defined() ? dcy.data_ptr() : nullptr, + w_desc.desc(), + w.data_ptr(), + descs.hx_desc.desc(), + hx.data_ptr(), + descs.cx_desc.desc(), + cx.defined() ? cx.data_ptr() : nullptr, + x_descs_arr.data(), + dx.data_ptr(), + descs.hx_desc.desc(), + dhx.data_ptr(), + descs.cx_desc.desc(), + cx.defined() ? dcx.data_ptr() : nullptr, + workspace.data_ptr(), + workspace.size(0), + fn_reserve.data_ptr(), + fn_reserve.size(0))); #else AT_CUDNN_CHECK(cudnnRNNBackwardData_v8( - handle, - descs.rnn_desc.desc(), - nullptr, - y_descs_arr.desc(), y.data_ptr(), - dy.data_ptr(), - x_descs_arr.desc(), dx.data_ptr(), - descs.hx_desc.desc(), hx.data_ptr(), - dhy.data_ptr(), - dhx.data_ptr(), - descs.cx_desc.desc(), cx.defined() ? cx.data_ptr() : nullptr, - cx.defined() ? dcy.data_ptr() : nullptr, - cx.defined() ? dcx.data_ptr() : nullptr, - weight_buf.numel() * weight_buf.element_size(), weight_buf.data_ptr(), - workspace.size(0), workspace.data_ptr(), - fn_reserve.size(0), fn_reserve.data_ptr())); + handle, + descs.rnn_desc.desc(), + nullptr, + y_descs_arr.desc(), + y.data_ptr(), + dy.data_ptr(), + x_descs_arr.desc(), + dx.data_ptr(), + descs.hx_desc.desc(), + hx.data_ptr(), + dhy.data_ptr(), + dhx.data_ptr(), + descs.cx_desc.desc(), + cx.defined() ? cx.data_ptr() : nullptr, + cx.defined() ? dcy.data_ptr() : nullptr, + cx.defined() ? dcx.data_ptr() : nullptr, + weight_buf.numel() * weight_buf.element_size(), + weight_buf.data_ptr(), + workspace.size(0), + workspace.data_ptr(), + fn_reserve.size(0), + fn_reserve.data_ptr())); #endif if (batch_first && !is_input_packed) { dx = dx.transpose_(0, 1); @@ -1596,27 +1912,52 @@ std::tuple _cudnn_rnn_backward_input( // We'll give a user friendly combined function... 
std::vector _cudnn_rnn_backward_weight( // TODO: I think tensor geometry sufficient for weight_buf/weight - const Tensor& input_r, TensorList weight_arr, int64_t weight_stride0, - const Tensor& weight_buf, const Tensor& hx, const Tensor& cx, + const Tensor& input_r, + TensorList weight_arr, + int64_t weight_stride0, + const Tensor& weight_buf, + const Tensor& hx, + const Tensor& cx, const Tensor& output_r, - int64_t fn_mode, int64_t fn_hidden_size, int64_t fn_proj_size, - int64_t fn_num_layers, bool batch_first, double fn_dropout, - bool fn_train, bool fn_bidirectional, IntArrayRef fn_batch_sizes, - const Tensor& fn_dropout_state, const Tensor& fn_reserve - ) { - - MatrixRef weight{ weight_arr, static_cast(weight_stride0) }; + int64_t fn_mode, + int64_t fn_hidden_size, + int64_t fn_proj_size, + int64_t fn_num_layers, + bool batch_first, + double fn_dropout, + bool fn_train, + bool fn_bidirectional, + IntArrayRef fn_batch_sizes, + const Tensor& fn_dropout_state, + const Tensor& fn_reserve) { + MatrixRef weight{weight_arr, static_cast(weight_stride0)}; auto input = input_r; auto output = output_r; RNNParams fn; auto datatype = getCudnnDataType(input); #ifndef USE_CUDNN_RNN_V8_API - fn.rnn.set(fn_mode, fn_hidden_size, fn_proj_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype); + fn.rnn.set( + fn_mode, + fn_hidden_size, + fn_proj_size, + fn_num_layers, + fn_bidirectional, + promote_rnn_math_type(datatype), + datatype); #else auto cudnn_input_size = input_r.size(-1); auto packed = fn_batch_sizes.size() != 0; - fn.rnn.set(fn_mode, cudnn_input_size, packed, fn_hidden_size, fn_proj_size, fn_num_layers, fn_bidirectional, promote_rnn_math_type(datatype), datatype); + fn.rnn.set( + fn_mode, + cudnn_input_size, + packed, + fn_hidden_size, + fn_proj_size, + fn_num_layers, + fn_bidirectional, + promote_rnn_math_type(datatype), + datatype); #endif fn.dropout.set(fn_train, fn_dropout, fn_dropout_state); fn.tensors.set(input.sizes(), fn_batch_sizes, batch_first); @@ -1624,8 +1965,7 @@ std::vector _cudnn_rnn_backward_weight( auto handle = getCudnnHandle(); if (fn.rnn.mode != CUDNN_LSTM) { - TORCH_CHECK(!cx.defined(), - "rnn: illegal defined cx for non-LSTM RNN"); + TORCH_CHECK(!cx.defined(), "rnn: illegal defined cx for non-LSTM RNN"); } auto is_input_packed = fn_batch_sizes.size() != 0; @@ -1637,21 +1977,27 @@ std::vector _cudnn_rnn_backward_weight( auto input_size = _input_size(fn.tensors); auto hidden_size = _hidden_size(fn.rnn, fn.tensors); - TORCH_CHECK(fn_train, - "cudnn RNN backward can only be called in training mode"); + TORCH_CHECK( + fn_train, "cudnn RNN backward can only be called in training mode"); - TORCH_CHECK(input.sizes().equals(input_size), - "Expected input size ", IntArrayRef{input_size}, ", got ", input.sizes()); - TORCH_CHECK(!hx.defined() || hx.sizes().equals(hidden_size), - "Expected hidden size ", IntArrayRef{hidden_size}, ", got ", hx.sizes()); + TORCH_CHECK( + input.sizes().equals(input_size), + "Expected input size ", + IntArrayRef{input_size}, + ", got ", + input.sizes()); + TORCH_CHECK( + !hx.defined() || hx.sizes().equals(hidden_size), + "Expected hidden size ", + IntArrayRef{hidden_size}, + ", got ", + hx.sizes()); // TODO: the above were the only checks in rnn.py, but it doesn't seem // like these checks are enough - TORCH_CHECK(hx.is_contiguous(), - "rnn: hx is not contiguous"); - TORCH_CHECK(!cx.defined() || cx.is_contiguous(), - "rnn: cx is not contiguous"); + TORCH_CHECK(hx.is_contiguous(), "rnn: hx is not contiguous"); + 
TORCH_CHECK(!cx.defined() || cx.is_contiguous(), "rnn: cx is not contiguous"); auto x = input.contiguous(); const auto& y = output; @@ -1671,109 +2017,188 @@ std::vector _cudnn_rnn_backward_weight( auto x_descs_arr = descs.get_x_descs(); auto y_descs_arr = descs.get_y_descs(); AT_CUDNN_CHECK(cudnnGetRNNWorkspaceSize( - handle, - descs.rnn_desc.desc(), - fn.tensors.seq_length, - x_descs_arr.data(), - &workspace_size - )); + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + &workspace_size)); #else auto& x_descs_arr = descs.x_descs; auto& y_descs_arr = descs.y_descs; AT_CUDNN_CHECK(cudnnGetRNNTempSpaceSizes( - handle, - descs.rnn_desc.desc(), - CUDNN_FWD_MODE_TRAINING, - x_descs_arr.desc(), - &workspace_size, - NULL - )); + handle, + descs.rnn_desc.desc(), + CUDNN_FWD_MODE_TRAINING, + x_descs_arr.desc(), + &workspace_size, + NULL)); #endif Tensor workspace = at::empty(workspace_size, input.options().dtype(kByte)); #ifndef USE_CUDNN_RNN_V8_API AT_CUDNN_CHECK(cudnnRNNBackwardWeights( - handle, - descs.rnn_desc.desc(), - fn.tensors.seq_length, - x_descs_arr.data(), x.data_ptr(), - descs.hx_desc.desc(), hx.data_ptr(), - y_descs_arr.data(), y.data_ptr(), - workspace.data_ptr(), workspace.size(0), - w_desc.desc(), dw.data_ptr(), - fn_reserve.data_ptr(), fn_reserve.size(0) - )); + handle, + descs.rnn_desc.desc(), + fn.tensors.seq_length, + x_descs_arr.data(), + x.data_ptr(), + descs.hx_desc.desc(), + hx.data_ptr(), + y_descs_arr.data(), + y.data_ptr(), + workspace.data_ptr(), + workspace.size(0), + w_desc.desc(), + dw.data_ptr(), + fn_reserve.data_ptr(), + fn_reserve.size(0))); #else AT_CUDNN_CHECK(cudnnRNNBackwardWeights_v8( - handle, - descs.rnn_desc.desc(), - CUDNN_WGRAD_MODE_ADD, - nullptr, - x_descs_arr.desc(), x.data_ptr(), - descs.hx_desc.desc(), hx.data_ptr(), - y_descs_arr.desc(), y.data_ptr(), - weight_buf.numel() * weight_buf.element_size(), dw.data_ptr(), - workspace.size(0), workspace.data_ptr(), - fn_reserve.size(0), fn_reserve.data_ptr())); + handle, + descs.rnn_desc.desc(), + CUDNN_WGRAD_MODE_ADD, + nullptr, + x_descs_arr.desc(), + x.data_ptr(), + descs.hx_desc.desc(), + hx.data_ptr(), + y_descs_arr.desc(), + y.data_ptr(), + weight_buf.numel() * weight_buf.element_size(), + dw.data_ptr(), + workspace.size(0), + workspace.data_ptr(), + fn_reserve.size(0), + fn_reserve.data_ptr())); #endif - - std::vector grad_params_arr; - size_t grad_params_stride0; #ifndef USE_CUDNN_RNN_V8_API - std::tie(grad_params_arr, grad_params_stride0) = get_parameters(handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, dw); + auto [grad_params_arr, grad_params_stride0] = get_parameters( + handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, dw); #else - std::tie(grad_params_arr, grad_params_stride0) = get_parameters(handle, fn.rnn, descs.rnn_desc, dw); + auto [grad_params_arr, grad_params_stride0] = + get_parameters(handle, fn.rnn, descs.rnn_desc, dw); #endif if (grad_params_stride0 == static_cast(weight_stride0)) { - _viewParams(MatrixRef{grad_params_arr, grad_params_stride0}, - MatrixRef{weight_arr, static_cast(weight_stride0)}); - return grad_params_arr; + _viewParams( + MatrixRef{grad_params_arr, grad_params_stride0}, + MatrixRef{weight_arr, static_cast(weight_stride0)}); + return grad_params_arr; } else { - std::vector grad_weight_arr; - grad_weight_arr.reserve( weight.numel() ); - for (const auto& w : weight_arr) { - grad_weight_arr.emplace_back(at::empty(w.sizes(), w.options())); - } - _copyParams(MatrixRef{grad_params_arr, grad_params_stride0}, - 
MatrixRef{grad_weight_arr, static_cast(weight_stride0)}); - return grad_weight_arr; + std::vector grad_weight_arr; + grad_weight_arr.reserve(weight.numel()); + for (const auto& w : weight_arr) { + grad_weight_arr.emplace_back(at::empty(w.sizes(), w.options())); + } + _copyParams( + MatrixRef{grad_params_arr, grad_params_stride0}, + MatrixRef{ + grad_weight_arr, static_cast(weight_stride0)}); + return grad_weight_arr; } } // We need this dispatcher because _cudnn_rnn_backward_weight has a stringent // ordering requirement with _cudnn_rnn_backward_input std::tuple> _cudnn_rnn_backward( - const Tensor& input, TensorList weight, int64_t weight_stride0, const Tensor& weight_buf, const Tensor& hx, const c10::optional& cx_opt, - const Tensor& output, const c10::optional& grad_output_r_opt, const c10::optional& grad_hy_r_opt, const c10::optional& grad_cy_r_opt, - int64_t mode, int64_t hidden_size, int64_t proj_size, - int64_t num_layers, bool batch_first, double dropout, - bool train, bool bidirectional, IntArrayRef batch_sizes, const c10::optional& dropout_state_opt, const Tensor& reserve, - std::array output_mask - ) { + const Tensor& input, + TensorList weight, + int64_t weight_stride0, + const Tensor& weight_buf, + const Tensor& hx, + const c10::optional& cx_opt, + const Tensor& output, + const c10::optional& grad_output_r_opt, + const c10::optional& grad_hy_r_opt, + const c10::optional& grad_cy_r_opt, + int64_t mode, + int64_t hidden_size, + int64_t proj_size, + int64_t num_layers, + bool batch_first, + double dropout, + bool train, + bool bidirectional, + IntArrayRef batch_sizes, + const c10::optional& dropout_state_opt, + const Tensor& reserve, + std::array output_mask) { // See [Note: hacky wrapper removal for optional tensor] - c10::MaybeOwned cx_maybe_owned = at::borrow_from_optional_tensor(cx_opt); + c10::MaybeOwned cx_maybe_owned = + at::borrow_from_optional_tensor(cx_opt); const Tensor& cx = *cx_maybe_owned; - const Tensor& grad_output_r = c10::value_or_else(grad_output_r_opt, [] {return Tensor();}); - const Tensor& grad_hy_r = c10::value_or_else(grad_hy_r_opt, [] {return Tensor();}); - const Tensor& grad_cy_r = c10::value_or_else(grad_cy_r_opt, [] {return Tensor();}); - const Tensor& dropout_state = c10::value_or_else(dropout_state_opt, [] {return Tensor();}); - - if (!grad_output_r.defined() && !grad_hy_r.defined() && !grad_cy_r.defined()) { - return std::tuple>(Tensor(), Tensor(), Tensor(), std::vector(weight.size())); + const Tensor& grad_output_r = + c10::value_or_else(grad_output_r_opt, [] { return Tensor(); }); + const Tensor& grad_hy_r = + c10::value_or_else(grad_hy_r_opt, [] { return Tensor(); }); + const Tensor& grad_cy_r = + c10::value_or_else(grad_cy_r_opt, [] { return Tensor(); }); + const Tensor& dropout_state = + c10::value_or_else(dropout_state_opt, [] { return Tensor(); }); + + if (!grad_output_r.defined() && !grad_hy_r.defined() && + !grad_cy_r.defined()) { + return std::tuple>( + Tensor(), Tensor(), Tensor(), std::vector(weight.size())); } - auto grad_output = grad_output_r.defined() ? grad_output_r : at::zeros_like(output, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - auto grad_hy = grad_hy_r.defined() ? grad_hy_r : at::zeros_like(hx, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - auto grad_cy = cx.defined() ? (grad_cy_r.defined() ? grad_cy_r : at::zeros_like(cx, LEGACY_CONTIGUOUS_MEMORY_FORMAT)) : grad_cy_r; + auto grad_output = grad_output_r.defined() + ? grad_output_r + : at::zeros_like(output, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto grad_hy = grad_hy_r.defined() + ? 
grad_hy_r + : at::zeros_like(hx, LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto grad_cy = cx.defined() + ? (grad_cy_r.defined() + ? grad_cy_r + : at::zeros_like(cx, LEGACY_CONTIGUOUS_MEMORY_FORMAT)) + : grad_cy_r; - Tensor dx, dhx, dcx; // NB: unconditionally compute this gradient, because it mutates reserve - std::tie(dx, dhx, dcx) = at::native::_cudnn_rnn_backward_input(input, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, {output_mask[0], output_mask[1], output_mask[2]}); + auto [dx, dhx, dcx] = at::native::_cudnn_rnn_backward_input( + input, + weight_buf, + hx, + cx, + output, + grad_output, + grad_hy, + grad_cy, + mode, + hidden_size, + proj_size, + num_layers, + batch_first, + dropout, + train, + bidirectional, + batch_sizes, + dropout_state, + reserve, + {output_mask[0], output_mask[1], output_mask[2]}); std::vector dw; if (output_mask[3]) { - dw = at::native::_cudnn_rnn_backward_weight(input, weight, weight_stride0, weight_buf, hx, cx, output, mode, hidden_size, proj_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve); + dw = at::native::_cudnn_rnn_backward_weight( + input, + weight, + weight_stride0, + weight_buf, + hx, + cx, + output, + mode, + hidden_size, + proj_size, + num_layers, + batch_first, + dropout, + train, + bidirectional, + batch_sizes, + dropout_state, + reserve); } - return std::tuple>{dx, dhx, dcx, dw}; + return std::tuple>{ + dx, dhx, dcx, dw}; } // TODO: I am not sure if we actually need the 'dropout' and 'train' parameters @@ -1785,13 +2210,18 @@ std::tuple> _cudnn_rnn_backward( // as input. The codegen currently assumes that ALL factory functions // take TensorOptions, so it's just a lot easier for this function to // be bound if it also does it. -Tensor _cudnn_init_dropout_state(double dropout, bool train, int64_t dropout_seed, +Tensor _cudnn_init_dropout_state( + double dropout, + bool train, + int64_t dropout_seed, c10::optional dtype, c10::optional layout, c10::optional device, c10::optional pin_memory) { // See [Note: hacky wrapper removal for TensorOptions] - TensorOptions options = TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory(pin_memory); + TensorOptions options = + TensorOptions().dtype(dtype).layout(layout).device(device).pinned_memory( + pin_memory); auto handle = getCudnnHandle(); DropoutDescriptor dropout_desc; @@ -1811,82 +2241,94 @@ std::tuple unpack_hidden(const Tensor& hidden) { return std::make_tuple(hidden, at::Tensor{}); } -std::tuple unpack_hidden(const std::tuple& hidden) { +std::tuple unpack_hidden( + const std::tuple& hidden) { return hidden; } -template +template hidden_type pack_hidden(const Tensor& hx, const Tensor& cx) { - static_assert(std::is_same::value, "pack_hidden not implemented for this type"); + static_assert( + std::is_same::value, + "pack_hidden not implemented for this type"); AT_ERROR("NOT IMPLEMENTED"); } -template<> +template <> Tensor pack_hidden(const Tensor& hx, const Tensor& cx) { AT_ASSERT(cx.numel() == 0); return hx; } -template<> -std::tuple pack_hidden>(const Tensor& hx, const Tensor& cx) { +template <> +std::tuple pack_hidden>( + const Tensor& hx, + const Tensor& cx) { return std::make_tuple(hx, cx); } /** * Note [DropoutState and CUDA graph capture] * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ - * (1) Telling a capturing stream to wait on an event recorded in a non-capturing stream is an error. 
- * (2) Telling a non-capturing stream to wait on an event recorded during capture is also an error. + * (1) Telling a capturing stream to wait on an event recorded in a + non-capturing stream is an error. + * (2) Telling a non-capturing stream to wait on an event recorded during + capture is also an error. * - * So DropoutState's usage syncs could error if an RNN with dropout is called in an uncaptured region - * then called in a captured region (triggering 1), or called in a captured region then called - # in an uncaptured region (triggering 2). + * So DropoutState's usage syncs could error if an RNN with dropout is called in + an uncaptured region + * then called in a captured region (triggering 1), or called in a captured + region then called # in an uncaptured region (triggering 2). * - * To prevent 1 and 2, lock() only syncs on the last usage event if it was recorded in the same - * capture state as the current state (which also means the same graph, if capture is in progress). + * To prevent 1 and 2, lock() only syncs on the last usage event if it was + recorded in the same + * capture state as the current state (which also means the same graph, if + capture is in progress). * - * The solution should be safe as long as capture obeys the following restrictions: + * The solution should be safe as long as capture obeys the following + restrictions: * - Only one capture may be underway at a time in a given process. - * - While a capture is underway, no calls to eager ops on noncapturing streams (on any thread) + * - While a capture is underway, no calls to eager ops on noncapturing streams + (on any thread) * may interleave with the captured ops. * - * TODO: As people experiment with capture, keep an eye out for use cases that might need to + * TODO: As people experiment with capture, keep an eye out for use cases that + might need to * relax those restrictions. * * See https://github.com/pytorch/pytorch/pull/56433 for more discussion. */ struct DropoutState { - // Both buffer and event are lazily instantiated when a dropout state is needed - // for the first time. Note that in this case needed != used, as we don't need - // a buffer to e.g. run RNNs in test mode. + // Both buffer and event are lazily instantiated when a dropout state is + // needed for the first time. Note that in this case needed != used, as we + // don't need a buffer to e.g. run RNNs in test mode. at::Tensor buffer; c10::optional event; std::mutex mutex; #if !defined(USE_ROCM) - // cudaStreamGetCaptureInfo will never give back a capture id of 0, so 0 can serve - // as a sentinel value that capture was not underway. + // cudaStreamGetCaptureInfo will never give back a capture id of 0, so 0 can + // serve as a sentinel value that capture was not underway. cuda::CaptureId_t capture_id_last_lock = 0; cuda::CaptureId_t capture_id_last_unlock = 0; #endif // Every time we use a dropout state, we need to synchronize with its event, // to make sure all previous uses finish running before this one starts. Once - // we're done, we record the event to allow others to synchronize with this kernel. - // Those events are really needed only for inter-stream sync on a single GPU. - // I doubt anyone will want to run cuDNN RNNs in parallel on a single GPU, so - // they should end up being complete no-ops. + // we're done, we record the event to allow others to synchronize with this + // kernel. Those events are really needed only for inter-stream sync on a + // single GPU. 
I doubt anyone will want to run cuDNN RNNs in parallel on a + // single GPU, so they should end up being complete no-ops. void lock() { - // NB: We can't ignore the lock even when event is undefined, because someone - // could then define it before we get to unlock(). + // NB: We can't ignore the lock even when event is undefined, because + // someone could then define it before we get to unlock(). mutex.lock(); if (event) { #if !defined(USE_ROCM) // See Note [DropoutState and CUDA graph capture] cudaStreamCaptureStatus status; - AT_CUDA_CHECK(cudaStreamGetCaptureInfo(cuda::getCurrentCUDAStream(), - &status, - &capture_id_last_lock)); + AT_CUDA_CHECK(cudaStreamGetCaptureInfo( + cuda::getCurrentCUDAStream(), &status, &capture_id_last_lock)); if (status == cudaStreamCaptureStatus::cudaStreamCaptureStatusNone) { capture_id_last_lock = 0; } @@ -1905,9 +2347,8 @@ struct DropoutState { #if !defined(USE_ROCM) // See Note [DropoutState and CUDA graph capture] cudaStreamCaptureStatus status; - AT_CUDA_CHECK(cudaStreamGetCaptureInfo(cuda::getCurrentCUDAStream(), - &status, - &capture_id_last_unlock)); + AT_CUDA_CHECK(cudaStreamGetCaptureInfo( + cuda::getCurrentCUDAStream(), &status, &capture_id_last_unlock)); if (status == cudaStreamCaptureStatus::cudaStreamCaptureStatusNone) { capture_id_last_unlock = 0; } @@ -1918,27 +2359,34 @@ struct DropoutState { } }; -DropoutState& get_dropout_state(double dropout_p, bool train, TensorOptions options) { - // Each state is slightly over 2MB and initialized lazily, so it's fine to cache them. - static std::vector dropout_state_cache { static_cast(cuda::getNumGPUs()) }; +DropoutState& get_dropout_state( + double dropout_p, + bool train, + TensorOptions options) { + // Each state is slightly over 2MB and initialized lazily, so it's fine to + // cache them. + static std::vector dropout_state_cache{ + static_cast(cuda::getNumGPUs())}; static std::mutex state_cache_mut; AT_ASSERT(options.device().is_cuda()); - int device = options.device().index(); + auto device = options.device().index(); - std::unique_lock lock {state_cache_mut}; + std::unique_lock lock{state_cache_mut}; auto& state = dropout_state_cache.at(device); if (train && dropout_p > 0) { - const auto &gen = at::detail::getCUDAHooks().getDefaultCUDAGenerator(device); + const auto& gen = + at::detail::getCUDAHooks().getDefaultCUDAGenerator(device); auto gen_impl = gen.get(); bool reset_rnn_state = gen_impl->reset_rnn_state(); if (!state.buffer.defined() || reset_rnn_state) { - std::unique_lock lock {state.mutex}; - int64_t seed = at::empty({}, options.dtype(at::kLong)).random_(gen).item(); + std::unique_lock lock{state.mutex}; + int64_t seed = + at::empty({}, options.dtype(at::kLong)).random_(gen).item(); state.buffer = at::_cudnn_init_dropout_state( dropout_p, train, seed, options.dtype(at::kByte)); - // NB: CUDA binds the event to a device at creation time, so we can initialize it - // only now, when we know we're on the correct device. + // NB: CUDA binds the event to a device at creation time, so we can + // initialize it only now, when we know we're on the correct device. 
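
The locking scheme above boils down to: remember which capture ID (0 when not capturing) the last event record happened under, and only wait on that event when the current stream is in the same capture state. A minimal standalone sketch of that idea, using the same three-argument cudaStreamGetCaptureInfo call as the hunk above; the CaptureAwareEvent type and its member names are illustrative, not part of ATen, and error checking is omitted.

#include <cuda_runtime.h>

struct CaptureAwareEvent {
  cudaEvent_t event = nullptr;
  unsigned long long last_record_id = 0;  // 0 == recorded outside capture

  static unsigned long long current_capture_id(cudaStream_t stream) {
    cudaStreamCaptureStatus status;
    unsigned long long id = 0;
    cudaStreamGetCaptureInfo(stream, &status, &id);
    return status == cudaStreamCaptureStatusNone ? 0ull : id;
  }

  // Wait only if the last record happened in the same capture state as the
  // current one, which sidesteps the two illegal cross-capture waits above.
  void wait_if_safe(cudaStream_t stream) {
    if (event && current_capture_id(stream) == last_record_id) {
      cudaStreamWaitEvent(stream, event, 0);
    }
  }

  void record(cudaStream_t stream) {
    if (!event) {
      cudaEventCreateWithFlags(&event, cudaEventDisableTiming);
    }
    cudaEventRecord(event, stream);
    last_record_id = current_capture_id(stream);
  }
};
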
if (!state.event.has_value()) { state.event.emplace(); } @@ -1948,12 +2396,17 @@ DropoutState& get_dropout_state(double dropout_p, bool train, TensorOptions opti } Tensor try_get_weight_buf( - const Tensor& input, TensorList parameters, bool has_biases, - cudnnRNNMode_t mode, c10::SymInt hidden_size, c10::SymInt proj_size, int64_t num_layers, bool bidirectional) { - + const Tensor& input, + TensorList parameters, + bool has_biases, + cudnnRNNMode_t mode, + c10::SymInt hidden_size, + c10::SymInt proj_size, + int64_t num_layers, + bool bidirectional) { // Prepare all relevant descriptors auto handle = getCudnnHandle(); - auto & any_param = parameters.at(0); + auto& any_param = parameters.at(0); auto datatype = getCudnnDataType(any_param); // Something very naughty is happening here. try_get_weight_buf @@ -1965,20 +2418,36 @@ Tensor try_get_weight_buf( // the relationships RNNDescriptorParams rnn; #ifndef USE_CUDNN_RNN_V8_API - rnn.set(mode, hidden_size.guard_int(__FILE__, __LINE__), proj_size.guard_int(__FILE__, __LINE__), num_layers, bidirectional, promote_rnn_math_type(datatype), datatype); + rnn.set( + mode, + hidden_size.guard_int(__FILE__, __LINE__), + proj_size.guard_int(__FILE__, __LINE__), + num_layers, + bidirectional, + promote_rnn_math_type(datatype), + datatype); #else auto cudnn_input_size = input.size(-1); - auto packed = false; // eqy: bogus as we do not know if the input is packed here - // again, it should also not affect the weights - rnn.set(mode, cudnn_input_size, packed, hidden_size.guard_int(__FILE__, __LINE__), proj_size.guard_int(__FILE__, __LINE__), num_layers, bidirectional, promote_rnn_math_type(datatype), datatype); + auto packed = false; // eqy: bogus as we do not know if the input is packed + // here again, it should also not affect the weights + rnn.set( + mode, + cudnn_input_size, + packed, + hidden_size.guard_int(__FILE__, __LINE__), + proj_size.guard_int(__FILE__, __LINE__), + num_layers, + bidirectional, + promote_rnn_math_type(datatype), + datatype); #endif RNNDescriptor rnn_desc = rnn.descriptor(handle); - TensorGeometry x_geom ({1, input.sym_size(-1).guard_int(__FILE__, __LINE__)}); + TensorGeometry x_geom({1, input.sym_size(-1).guard_int(__FILE__, __LINE__)}); TensorDescriptor x_desc; // datatype for x_desc comes from any_param, not input. - // try_get_weight_buf's job is to check "is the weight buffer correctly laid out - // for us to run it with input of the same datatype?" + // try_get_weight_buf's job is to check "is the weight buffer correctly laid + // out for us to run it with input of the same datatype?" 
x_desc.set(datatype, x_geom.sizes(), x_geom.strides(), 5); #ifndef USE_CUDNN_RNN_V8_API @@ -2008,40 +2477,51 @@ Tensor try_get_weight_buf( if (has_biases) { AT_ASSERT(num_ptrs == num_parameters); for (const auto i : c10::irange(num_parameters)) { - if (expected_data_ptrs[i] != parameters[i].data_ptr()) return {}; + if (expected_data_ptrs[i] != parameters[i].data_ptr()) + return {}; } } else { AT_ASSERT(num_parameters % 3 == 0); AT_ASSERT(num_ptrs == num_parameters * 5 / 3); - for (int64_t param_i = 0, ptr_i = 0; - ptr_i < num_ptrs; - ptr_i += 5, param_i += 3) { - if (expected_data_ptrs[ptr_i] != parameters[param_i].data_ptr()) return {}; - if (expected_data_ptrs[ptr_i + 1] != parameters[param_i + 1].data_ptr()) return {}; - if (expected_data_ptrs[ptr_i + 4] != parameters[param_i + 2].data_ptr()) return {}; + for (int64_t param_i = 0, ptr_i = 0; ptr_i < num_ptrs; + ptr_i += 5, param_i += 3) { + if (expected_data_ptrs[ptr_i] != parameters[param_i].data_ptr()) + return {}; + if (expected_data_ptrs[ptr_i + 1] != parameters[param_i + 1].data_ptr()) + return {}; + if (expected_data_ptrs[ptr_i + 4] != parameters[param_i + 2].data_ptr()) + return {}; } } } else { AT_ASSERT(num_ptrs == (num_parameters * (has_biases ? 1 : 2))); AT_ASSERT(num_parameters % (has_biases ? 4 : 2) == 0); - for (int64_t param_i = 0, ptr_i = 0; - ptr_i < num_ptrs; - ptr_i += (has_biases ? 2 : 4), param_i += 2) { - if (expected_data_ptrs[ptr_i] != parameters[param_i].data_ptr()) return {}; - if (expected_data_ptrs[ptr_i + 1] != parameters[param_i + 1].data_ptr()) return {}; + for (int64_t param_i = 0, ptr_i = 0; ptr_i < num_ptrs; + ptr_i += (has_biases ? 2 : 4), param_i += 2) { + if (expected_data_ptrs[ptr_i] != parameters[param_i].data_ptr()) + return {}; + if (expected_data_ptrs[ptr_i + 1] != parameters[param_i + 1].data_ptr()) + return {}; } } - if (!parameters[num_parameters - 1].is_contiguous()) return {}; + if (!parameters[num_parameters - 1].is_contiguous()) + return {}; return weight_buf; } -template +template std::pair _cudnn_impl( - const Tensor& input, const Tensor& _batch_sizes, const hidden_type& hidden, - TensorList params, bool has_biases, cudnnRNNMode_t mode, - int64_t num_layers, double dropout_p, bool train, bool bidirectional) { - Tensor hx, cx; - std::tie(hx, cx) = unpack_hidden(hidden); + const Tensor& input, + const Tensor& _batch_sizes, + const hidden_type& hidden, + TensorList params, + bool has_biases, + cudnnRNNMode_t mode, + int64_t num_layers, + double dropout_p, + bool train, + bool bidirectional) { + auto [hx, cx] = unpack_hidden(hidden); auto hidden_size = hx.sym_size(2); SymInt proj_size = 0; // For LSTM models with projections hidden size could be different @@ -2050,40 +2530,72 @@ std::pair _cudnn_impl( proj_size = hx.sym_size(2); } - // TODO: try_get_weight_buf returns a Tensor, but _cudnn_rnn below takes a c10::optional - // in weight_buf's slot. Do we want try_get_weight_buf to return a c10::optional - // instead of a defined or undefined Tensor? + // TODO: try_get_weight_buf returns a Tensor, but _cudnn_rnn below takes a + // c10::optional in weight_buf's slot. Do we want try_get_weight_buf + // to return a c10::optional instead of a defined or undefined Tensor? 
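
The pointer comparisons above are the core of try_get_weight_buf: cuDNN reports the address at which it expects each weight and bias matrix inside the flat weight buffer, and the buffer is only reused when every parameter tensor already aliases its expected slot. A condensed sketch of that check, collapsing the biased and bias-free cases into one loop; the helper name is illustrative, and expected_data_ptrs stands in for the pointers queried from cuDNN.

#include <ATen/ATen.h>
#include <c10/util/irange.h>
#include <vector>

static bool params_alias_flat_buffer(
    const std::vector<void*>& expected_data_ptrs,
    at::TensorList parameters) {
  if (parameters.empty() ||
      expected_data_ptrs.size() != parameters.size()) {
    return false;  // layout mismatch: caller falls back to copying weights
  }
  for (const auto i : c10::irange(parameters.size())) {
    if (expected_data_ptrs[i] != parameters[i].data_ptr()) {
      return false;  // parameter does not live at its expected offset
    }
  }
  // cuDNN also requires the trailing parameter to be contiguous.
  return parameters.back().is_contiguous();
}
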
at::cuda::OptionalCUDAGuard guard(input.get_device()); auto weight_buf = try_get_weight_buf( - input, params, has_biases, mode, hidden_size, proj_size, num_layers, bidirectional); + input, + params, + has_biases, + mode, + hidden_size, + proj_size, + num_layers, + bidirectional); TORCH_CHECK(_batch_sizes.dim() == 1, "batch_sizes tensor should be 1D"); - IntArrayRef batch_sizes { _batch_sizes.data_ptr(), static_cast(_batch_sizes.size(0)) }; + IntArrayRef batch_sizes{ + _batch_sizes.data_ptr(), + static_cast(_batch_sizes.size(0))}; - auto & dropout_state = get_dropout_state(dropout_p, train, input.options()); - std::unique_lock lock { dropout_state }; + auto& dropout_state = get_dropout_state(dropout_p, train, input.options()); + std::unique_lock lock{dropout_state}; int64_t num_params = has_biases ? 4 : 2; if (proj_size != 0) { ++num_params; } - auto sym_batch_sizes = c10::SymIntArrayRef(reinterpret_cast(batch_sizes.data()), batch_sizes.size()); + auto sym_batch_sizes = c10::SymIntArrayRef( + reinterpret_cast(batch_sizes.data()), + batch_sizes.size()); // cudnn_output = std::tuple auto cudnn_output = at::_cudnn_rnn_symint( - input, params, num_params, weight_buf, - hx, cx, static_cast(mode), hidden_size, proj_size, num_layers, /*batch_first=*/false, - dropout_p, train, bidirectional, sym_batch_sizes, dropout_state.buffer); + input, + params, + num_params, + weight_buf, + hx, + cx, + static_cast(mode), + hidden_size, + proj_size, + num_layers, + /*batch_first=*/false, + dropout_p, + train, + bidirectional, + sym_batch_sizes, + dropout_state.buffer); - return {std::get<0>(cudnn_output), - pack_hidden(std::get<1>(cudnn_output), std::get<2>(cudnn_output))}; + return { + std::get<0>(cudnn_output), + pack_hidden( + std::get<1>(cudnn_output), std::get<2>(cudnn_output))}; } -template +template std::pair _cudnn_impl( - const Tensor& input, const hidden_type& hidden, - TensorList params, bool has_biases, cudnnRNNMode_t mode, - int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { - Tensor hx, cx; - std::tie(hx, cx) = unpack_hidden(hidden); + const Tensor& input, + const hidden_type& hidden, + TensorList params, + bool has_biases, + cudnnRNNMode_t mode, + int64_t num_layers, + double dropout_p, + bool train, + bool bidirectional, + bool batch_first) { + auto [hx, cx] = unpack_hidden(hidden); auto hidden_size = hx.sym_size(2); c10::SymInt proj_size = 0; // For LSTM models with projections hidden size could be different @@ -2093,64 +2605,156 @@ std::pair _cudnn_impl( } at::cuda::OptionalCUDAGuard guard(input.get_device()); auto weight_buf = try_get_weight_buf( - input, params, has_biases, mode, hidden_size, proj_size, num_layers, bidirectional); - auto & dropout_state = get_dropout_state(dropout_p, train, input.options()); - std::unique_lock lock { dropout_state }; + input, + params, + has_biases, + mode, + hidden_size, + proj_size, + num_layers, + bidirectional); + auto& dropout_state = get_dropout_state(dropout_p, train, input.options()); + std::unique_lock lock{dropout_state}; int64_t num_params = has_biases ? 
4 : 2; if (proj_size != 0) { ++num_params; } // cudnn_output = std::tuple auto cudnn_output = at::_cudnn_rnn_symint( - input, params, num_params, weight_buf, - hx, cx, static_cast(mode), hidden_size, proj_size, num_layers, batch_first, dropout_p, - train, bidirectional, /*batch_sizes=*/{}, dropout_state.buffer); + input, + params, + num_params, + weight_buf, + hx, + cx, + static_cast(mode), + hidden_size, + proj_size, + num_layers, + batch_first, + dropout_p, + train, + bidirectional, + /*batch_sizes=*/{}, + dropout_state.buffer); - return {std::get<0>(cudnn_output), - pack_hidden(std::get<1>(cudnn_output), std::get<2>(cudnn_output))}; + return { + std::get<0>(cudnn_output), + pack_hidden( + std::get<1>(cudnn_output), std::get<2>(cudnn_output))}; } -#define ONE_HIDDEN_RNN(NAME, MODE) \ -void NAME##_cudnn(Tensor& output, Tensor& hy, \ - const Tensor& input, const Tensor& hx, \ - TensorList params, bool has_biases, \ - int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { \ - std::tie(output, hy) = _cudnn_impl(input, hx, params, has_biases, \ - MODE, num_layers, dropout_p, train, bidirectional, batch_first); \ -} \ - \ -void NAME##_packed_cudnn(Tensor& output, Tensor& hy, \ - const Tensor& data, const Tensor& batch_sizes, const Tensor& hx, \ - TensorList params, bool has_biases, \ - int64_t num_layers, double dropout_p, bool train, bool bidirectional) { \ - std::tie(output, hy) = _cudnn_impl(data, batch_sizes, hx, params, \ - has_biases, MODE, num_layers, dropout_p, train, bidirectional); \ -} \ - \ -REGISTER_CUDA_DISPATCH(NAME##_cudnn_stub, &NAME##_cudnn); \ -REGISTER_CUDA_DISPATCH(NAME##_packed_cudnn_stub, &NAME##_packed_cudnn); +#define ONE_HIDDEN_RNN(NAME, MODE) \ + void NAME##_cudnn( \ + Tensor& output, \ + Tensor& hy, \ + const Tensor& input, \ + const Tensor& hx, \ + TensorList params, \ + bool has_biases, \ + int64_t num_layers, \ + double dropout_p, \ + bool train, \ + bool bidirectional, \ + bool batch_first) { \ + std::tie(output, hy) = _cudnn_impl( \ + input, \ + hx, \ + params, \ + has_biases, \ + MODE, \ + num_layers, \ + dropout_p, \ + train, \ + bidirectional, \ + batch_first); \ + } \ + \ + void NAME##_packed_cudnn( \ + Tensor& output, \ + Tensor& hy, \ + const Tensor& data, \ + const Tensor& batch_sizes, \ + const Tensor& hx, \ + TensorList params, \ + bool has_biases, \ + int64_t num_layers, \ + double dropout_p, \ + bool train, \ + bool bidirectional) { \ + std::tie(output, hy) = _cudnn_impl( \ + data, \ + batch_sizes, \ + hx, \ + params, \ + has_biases, \ + MODE, \ + num_layers, \ + dropout_p, \ + train, \ + bidirectional); \ + } \ + \ + REGISTER_CUDA_DISPATCH(NAME##_cudnn_stub, &NAME##_cudnn); \ + REGISTER_CUDA_DISPATCH(NAME##_packed_cudnn_stub, &NAME##_packed_cudnn); ONE_HIDDEN_RNN(gru, CUDNN_GRU) ONE_HIDDEN_RNN(rnn_tanh, CUDNN_RNN_TANH) ONE_HIDDEN_RNN(rnn_relu, CUDNN_RNN_RELU) -void lstm_cudnn(Tensor& output, Tensor& hy, Tensor& cy, - const Tensor& input, TensorList hx, - TensorList params, bool has_biases, - int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { - auto result = _cudnn_impl(input, std::make_tuple(hx[0], hx[1]), params, has_biases, - CUDNN_LSTM, num_layers, dropout_p, train, bidirectional, batch_first); +void lstm_cudnn( + Tensor& output, + Tensor& hy, + Tensor& cy, + const Tensor& input, + TensorList hx, + TensorList params, + bool has_biases, + int64_t num_layers, + double dropout_p, + bool train, + bool bidirectional, + bool batch_first) { + auto result = _cudnn_impl( + input, + 
std::make_tuple(hx[0], hx[1]), + params, + has_biases, + CUDNN_LSTM, + num_layers, + dropout_p, + train, + bidirectional, + batch_first); output = result.first; hy = std::get<0>(result.second); cy = std::get<1>(result.second); } -void lstm_packed_cudnn(Tensor& output, Tensor& hy, Tensor& cy, - const Tensor& data, const Tensor& batch_sizes, TensorList hx, - TensorList params, bool has_biases, - int64_t num_layers, double dropout_p, bool train, bool bidirectional) { - auto result = _cudnn_impl(data, batch_sizes, std::make_tuple(hx[0], hx[1]), - params, has_biases, CUDNN_LSTM, num_layers, dropout_p, train, bidirectional); +void lstm_packed_cudnn( + Tensor& output, + Tensor& hy, + Tensor& cy, + const Tensor& data, + const Tensor& batch_sizes, + TensorList hx, + TensorList params, + bool has_biases, + int64_t num_layers, + double dropout_p, + bool train, + bool bidirectional) { + auto result = _cudnn_impl( + data, + batch_sizes, + std::make_tuple(hx[0], hx[1]), + params, + has_biases, + CUDNN_LSTM, + num_layers, + dropout_p, + train, + bidirectional); output = result.first; hy = std::get<0>(result.second); cy = std::get<1>(result.second); @@ -2159,8 +2763,9 @@ void lstm_packed_cudnn(Tensor& output, Tensor& hy, Tensor& cy, REGISTER_CUDA_DISPATCH(lstm_cudnn_stub, &lstm_cudnn); REGISTER_CUDA_DISPATCH(lstm_packed_cudnn_stub, &lstm_packed_cudnn); -} // anonymous namepsace +} // namespace -}} // namespace at::native +} // namespace at +} // namespace at #endif // AT_CUDNN_ENABLED() diff --git a/aten/src/ATen/native/cudnn/RNNUtils.h b/aten/src/ATen/native/cudnn/RNNUtils.h index 64a2ecbef62e1..7e2869a805740 100644 --- a/aten/src/ATen/native/cudnn/RNNUtils.h +++ b/aten/src/ATen/native/cudnn/RNNUtils.h @@ -1,9 +1,9 @@ #pragma once -#include #include #include #include +#include // Declares utilities used by RNN.cpp and also needed by external consumers namespace at { diff --git a/aten/src/ATen/native/layer_norm.cpp b/aten/src/ATen/native/layer_norm.cpp index 88d53da856d85..27a701dd2eb49 100644 --- a/aten/src/ATen/native/layer_norm.cpp +++ b/aten/src/ATen/native/layer_norm.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include #include @@ -18,6 +19,9 @@ #include #include #include +#include +#include +#include #include #endif @@ -258,4 +262,49 @@ std::tuple math_native_layer_norm( rstd = rstd.view(stat_shape); return std::make_tuple(out, mean, rstd); } + +Tensor rms_norm( + const Tensor& input, + IntArrayRef normalized_shape, + const c10::optional& weight_opt /* optional */, + c10::optional eps) { + + // See [Note: hacky wrapper removal for optional tensor] + c10::MaybeOwned weight_maybe_owned = at::borrow_from_optional_tensor(weight_opt); + const Tensor& weight = *weight_maybe_owned; + auto bias_opt = at::optional(); + const Tensor& bias = *at::borrow_from_optional_tensor(bias_opt); + (void) _check_layer_norm_inputs(input, normalized_shape, weight, bias); + + std::vector dims_to_reduce; + for (const auto i : c10::irange(normalized_shape.size())) { + dims_to_reduce.push_back(input.dim() - i - 1); + } + IntArrayRef dims_to_reduce_ref = IntArrayRef(dims_to_reduce); + + auto result = AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2( + at::ScalarType::Half, + at::ScalarType::BFloat16, + input.scalar_type(), + "rms_norm", + [&] { + scalar_t eps_val; + if (!eps.has_value()) { + eps_val = std::numeric_limits::type>::epsilon(); + } else { + eps_val = eps.value(); + } + + auto result = input.mul(at::rsqrt(at::pow(input, 2).mean(dims_to_reduce_ref, /*keep_dim=*/true).add_(eps_val))); + + if 
(weight_opt.has_value()) { + result = result.mul(weight_opt.value()); + } + + return result; + }); + + return result; + +} } // namespace at::native diff --git a/aten/src/ATen/native/layer_norm.h b/aten/src/ATen/native/layer_norm.h index 13fb1e4783d20..38e63569586e3 100644 --- a/aten/src/ATen/native/layer_norm.h +++ b/aten/src/ATen/native/layer_norm.h @@ -71,6 +71,12 @@ void layer_norm_cpu_out( int64_t M, int64_t N); +Tensor rms_norm( + const Tensor& input, + IntArrayRef normalized_shape, + const c10::optional& weight_opt /* optional */, + c10::optional eps); + using forward_fn = void (*)( const Tensor& /* X */, const Tensor& /* gamma */, diff --git a/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm b/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm index 14c98f99cff02..926a52ffb8d20 100644 --- a/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm +++ b/aten/src/ATen/native/metal/mpscnn/MPSImageWrapper.mm @@ -40,8 +40,7 @@ - (void)endSynchronization:(NSError*)error { if (_imageWrapper) { _imageWrapper->release(); } - // throw an exception with error details - METAL_THROW_IF_ERROR(error, "Command buffer execution failed!"); + // T159183991: ignore error. We prefer to not crash the app. } } diff --git a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm index 04c82e94bda01..dbb5ee5c98de8 100644 --- a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm +++ b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm @@ -647,7 +647,7 @@ bool test_view2() { } bool test_view3() { - // nonarry -> array + // nonarray -> array __block std::vector size{5, 8}; return TEST(size, __PRETTY_FUNCTION__, ^bool { auto X1 = at::rand(size, at::TensorOptions(at::kCPU).dtype(at::kFloat)); diff --git a/aten/src/ATen/native/metal/ops/MetalSoftmax.mm b/aten/src/ATen/native/metal/ops/MetalSoftmax.mm index cfd55039cabd2..11ebe255953f2 100644 --- a/aten/src/ATen/native/metal/ops/MetalSoftmax.mm +++ b/aten/src/ATen/native/metal/ops/MetalSoftmax.mm @@ -20,7 +20,7 @@ Tensor mpscnn_softmax( int64_t dim, c10::optional dtype) { TORCH_CHECK(input.is_metal()); - // TODO: [T87180544] Implment softmax/log_softmax in metal shaders + // TODO: [T87180544] Implement softmax/log_softmax in metal shaders TORCH_CHECK(input.dim() == 2); if(input.numel() == 0){ return makeTensor({input.sizes().vec()}, input.options()); diff --git a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp index 6ae3b6ab143bc..7c641b3fadd89 100644 --- a/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp +++ b/aten/src/ATen/native/miopen/BatchNorm_miopen.cpp @@ -117,11 +117,14 @@ std::tuple miopen_batch_norm( save_var = at::empty({ num_features }, weight_t.options()); MIOPEN_CHECK(miopenBatchNormalizationForwardTraining( handle, mode, &one, &zero, - idesc.desc(), input->data_ptr(), + idesc.desc(), input->const_data_ptr(), idesc.desc(), output->data_ptr(), wdesc.desc(), - weight->data_ptr(), - bias->data_ptr(), + // NOTE: MIOpen docs say that the bnScale and bnBias args are only inputs, + // not outputs. 
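
Stepping back to the rms_norm composite added in layer_norm.cpp above: it computes y = x * rsqrt(mean(x^2, over normalized_shape) + eps) * weight, with eps defaulting to the dtype's machine epsilon when unspecified. A minimal ATen sketch of the same math, specialized to normalizing over the last dimension; the function name and the fixed 1e-6 default are illustrative, not the signature registered by this patch.

#include <ATen/ATen.h>

at::Tensor rms_norm_last_dim(
    const at::Tensor& x,
    const at::Tensor& weight,
    double eps = 1e-6) {
  const int64_t last_dim = x.dim() - 1;
  // Mean of x^2 over the last dimension, kept for broadcasting.
  auto ms = at::pow(x, 2).mean(at::IntArrayRef{last_dim}, /*keepdim=*/true);
  // Scale by the reciprocal RMS, then by the learned weight.
  return x.mul(at::rsqrt(ms.add(eps))).mul(weight);
}
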
However, unfortunately the function signature only takes + // non-const pointers, presumably by accident + const_cast(weight->const_data_ptr()), + const_cast(bias->const_data_ptr()), exponential_average_factor, at::maybe_data_ptr(running_mean), at::maybe_data_ptr(running_var), @@ -133,11 +136,14 @@ std::tuple miopen_batch_norm( save_var = at::empty({0}, weight_t.options()); MIOPEN_CHECK(miopenBatchNormalizationForwardInference( handle, mode, &one, &zero, - idesc.desc(), input->data_ptr(), + idesc.desc(), input->const_data_ptr(), idesc.desc(), output->data_ptr(), wdesc.desc(), - weight->data_ptr(), - bias->data_ptr(), + // NOTE: MIOpen docs say that the bnScale and bnBias args are only inputs, + // not outputs. However, unfortunately the function signature only takes + // non-const pointers, presumably by accident + const_cast(weight->const_data_ptr()), + const_cast(bias->const_data_ptr()), running_mean->data_ptr(), running_var->data_ptr(), epsilon)); @@ -216,15 +222,15 @@ std::tuple miopen_batch_norm_backward( MIOPEN_CHECK(miopenBatchNormalizationBackward( handle, mode, &one, &zero, &one, &zero, - idesc.desc(), input->data_ptr(), - idesc.desc(), grad_output->data_ptr(), + idesc.desc(), input->const_data_ptr(), + idesc.desc(), grad_output->const_data_ptr(), idesc.desc(), grad_input_t.data_ptr(), - wdesc.desc(), weight->data_ptr(), + wdesc.desc(), weight->const_data_ptr(), grad_weight_t.data_ptr(), grad_bias_t.data_ptr(), epsilon, - save_mean->data_ptr(), - save_var->data_ptr())); + save_mean->const_data_ptr(), + save_var->const_data_ptr())); return std::tuple{grad_input_t, grad_weight_t, grad_bias_t}; } diff --git a/aten/src/ATen/native/miopen/Conv_miopen.cpp b/aten/src/ATen/native/miopen/Conv_miopen.cpp index 4a192cae2a20d..88f889c2cc1fa 100644 --- a/aten/src/ATen/native/miopen/Conv_miopen.cpp +++ b/aten/src/ATen/native/miopen/Conv_miopen.cpp @@ -371,8 +371,8 @@ struct algorithm_search { Workspace ws(max_ws_size); MIOPEN_CHECK(miopenFindConvolutionForwardAlgorithm( args.handle, - args.idesc.desc(), args.input.data_ptr(), - args.wdesc.desc(), args.weight.data_ptr(), + args.idesc.desc(), args.input.const_data_ptr(), + args.wdesc.desc(), args.weight.const_data_ptr(), args.cdesc.desc(), args.odesc.desc(), args.output.data_ptr(), 1, // just return the fastest @@ -444,8 +444,8 @@ struct algorithm_search { Workspace ws(max_ws_size); MIOPEN_CHECK(miopenFindConvolutionBackwardDataAlgorithm( args.handle, - args.odesc.desc(), args.output.data_ptr(), - args.wdesc.desc(), args.weight.data_ptr(), + args.odesc.desc(), args.output.const_data_ptr(), + args.wdesc.desc(), args.weight.const_data_ptr(), args.cdesc.desc(), args.idesc.desc(), args.input.data_ptr(), 1, // just return the fastest @@ -517,8 +517,8 @@ struct algorithm_search { Workspace ws(max_ws_size); MIOPEN_CHECK(miopenFindConvolutionBackwardWeightsAlgorithm( args.handle, - args.odesc.desc(), args.output.data_ptr(), - args.idesc.desc(), args.input.data_ptr(), + args.odesc.desc(), args.output.const_data_ptr(), + args.idesc.desc(), args.input.const_data_ptr(), args.cdesc.desc(), args.wdesc.desc(), args.weight.data_ptr(), 1, // just return the fastest @@ -599,7 +599,10 @@ void findAlgorithm(const ConvolutionArgs& args, bool benchmark, algo_t* algo) { cache.insert(args.params, *algo); wsscache.insert(args.params, perfResults.memory); - c10::hip::HIPCachingAllocator::emptyCache(); + if (at::native::_cudnn_get_conv_benchmark_empty_cache()) { + c10::hip::HIPCachingAllocator::emptyCache(); + } + } template @@ -682,7 +685,7 @@ void 
miopen_convolution_add_bias_(CheckedFrom c, const TensorArg& output, const Constant one(dataType, 1); Constant zero(dataType, 0); - MIOPEN_CHECK(miopenConvolutionForwardBias(handle, &one, bdesc.desc(), bias->data_ptr(), + MIOPEN_CHECK(miopenConvolutionForwardBias(handle, &one, bdesc.desc(), bias->const_data_ptr(), &zero, odesc.desc(), output->data_ptr())); */ } @@ -730,8 +733,8 @@ void raw_miopen_convolution_forward_out( MIOPEN_CHECK(miopenConvolutionForward( args.handle, - &one, args.idesc.desc(), input.data_ptr(), - args.wdesc.desc(), weight.data_ptr(), + &one, args.idesc.desc(), input.const_data_ptr(), + args.wdesc.desc(), weight.const_data_ptr(), args.cdesc.desc(), fwdAlg, &zero, args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size)); } @@ -741,8 +744,8 @@ void raw_miopen_convolution_forward_out( MIOPEN_CHECK(miopenConvolutionForwardImmediate( args.handle, - args.wdesc.desc(), weight.data_ptr(), - args.idesc.desc(), input.data_ptr(), + args.wdesc.desc(), weight.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), args.cdesc.desc(), args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size, solution_id)); } @@ -838,8 +841,8 @@ void raw_miopen_depthwise_convolution_forward_out( MIOPEN_CHECK(miopenConvolutionForward( args.handle, - &one, args.idesc.desc(), input.data_ptr(), - args.wdesc.desc(), weight.data_ptr(), + &one, args.idesc.desc(), input.const_data_ptr(), + args.wdesc.desc(), weight.const_data_ptr(), args.cdesc.desc(), fwdAlg, &zero, args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size)); } @@ -849,8 +852,8 @@ void raw_miopen_depthwise_convolution_forward_out( MIOPEN_CHECK(miopenConvolutionForwardImmediate( args.handle, - args.wdesc.desc(), weight.data_ptr(), - args.idesc.desc(), input.data_ptr(), + args.wdesc.desc(), weight.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), args.cdesc.desc(), args.odesc.desc(), output.data_ptr(), workspace.data, workspace.size, solution_id)); } @@ -993,8 +996,8 @@ void raw_miopen_convolution_backward_weight_out( MIOPEN_CHECK(miopenConvolutionBackwardWeights( args.handle, - &one, args.odesc.desc(), grad_output.data_ptr(), - args.idesc.desc(), input.data_ptr(), + &one, args.odesc.desc(), grad_output.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), args.cdesc.desc(), bwdFilterAlg, &zero, args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); } @@ -1004,8 +1007,8 @@ void raw_miopen_convolution_backward_weight_out( MIOPEN_CHECK(miopenConvolutionBackwardWeightsImmediate( args.handle, - args.odesc.desc(), grad_output.data_ptr(), - args.idesc.desc(), input.data_ptr(), + args.odesc.desc(), grad_output.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), args.cdesc.desc(), args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size, solution_id)); } @@ -1037,8 +1040,8 @@ void raw_miopen_depthwise_convolution_backward_weight_out( MIOPEN_CHECK(miopenConvolutionBackwardWeights( args.handle, - &one, args.odesc.desc(), grad_output.data_ptr(), - args.idesc.desc(), input.data_ptr(), + &one, args.odesc.desc(), grad_output.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), args.cdesc.desc(), bwdFilterAlg, &zero, args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size)); } @@ -1048,8 +1051,8 @@ void raw_miopen_depthwise_convolution_backward_weight_out( MIOPEN_CHECK(miopenConvolutionBackwardWeightsImmediate( args.handle, - args.odesc.desc(), grad_output.data_ptr(), - args.idesc.desc(), input.data_ptr(), + 
args.odesc.desc(), grad_output.const_data_ptr(), + args.idesc.desc(), input.const_data_ptr(), args.cdesc.desc(), args.wdesc.desc(), grad_weight.data_ptr(), workspace.data, workspace.size, solution_id)); } @@ -1242,8 +1245,8 @@ void raw_miopen_convolution_backward_input_out( MIOPEN_CHECK(miopenConvolutionBackwardData( args.handle, - &one, args.odesc.desc(), grad_output.data_ptr(), - args.wdesc.desc(), weight.data_ptr(), + &one, args.odesc.desc(), grad_output.const_data_ptr(), + args.wdesc.desc(), weight.const_data_ptr(), args.cdesc.desc(), bwdDataAlg, &zero, args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size)); } @@ -1253,8 +1256,8 @@ void raw_miopen_convolution_backward_input_out( MIOPEN_CHECK(miopenConvolutionBackwardDataImmediate( args.handle, - args.odesc.desc(), grad_output.data_ptr(), - args.wdesc.desc(), weight.data_ptr(), + args.odesc.desc(), grad_output.const_data_ptr(), + args.wdesc.desc(), weight.const_data_ptr(), args.cdesc.desc(), args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size, solution_id)); } @@ -1351,8 +1354,8 @@ void raw_miopen_depthwise_convolution_backward_input_out( MIOPEN_CHECK(miopenConvolutionBackwardData( args.handle, - &one, args.odesc.desc(), grad_output.data_ptr(), - args.wdesc.desc(), weight.data_ptr(), + &one, args.odesc.desc(), grad_output.const_data_ptr(), + args.wdesc.desc(), weight.const_data_ptr(), args.cdesc.desc(), bwdDataAlg, &zero, args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size)); } @@ -1362,8 +1365,8 @@ void raw_miopen_depthwise_convolution_backward_input_out( MIOPEN_CHECK(miopenConvolutionBackwardDataImmediate( args.handle, - args.odesc.desc(), grad_output.data_ptr(), - args.wdesc.desc(), weight.data_ptr(), + args.odesc.desc(), grad_output.const_data_ptr(), + args.wdesc.desc(), weight.const_data_ptr(), args.cdesc.desc(), args.idesc.desc(), grad_input.mutable_data_ptr(), workspace.data, workspace.size, solution_id)); } @@ -1528,11 +1531,11 @@ void raw_miopen_convolution_relu_out( float activ_gamma = static_cast(0); miopenOperatorArgs_t fusionArgs; MIOPEN_CHECK(miopenCreateOperatorArgs(&fusionArgs)); - MIOPEN_CHECK(miopenSetOpArgsConvForward(fusionArgs, convoOp, &alpha, &beta, weight.data_ptr())); - MIOPEN_CHECK(miopenSetOpArgsBiasForward(fusionArgs, biasOp, &alpha, &beta, bias.data_ptr())); + MIOPEN_CHECK(miopenSetOpArgsConvForward(fusionArgs, convoOp, &alpha, &beta, weight.const_data_ptr())); + MIOPEN_CHECK(miopenSetOpArgsBiasForward(fusionArgs, biasOp, &alpha, &beta, bias.const_data_ptr())); MIOPEN_CHECK(miopenSetOpArgsActivForward(fusionArgs, activOp, &alpha, &beta, activ_alpha, activ_beta, activ_gamma)); - miopenExecuteFusionPlan(args.handle, fusePlanDesc, args.idesc.desc(), input.data_ptr(), args.odesc.desc(), output.data_ptr(), fusionArgs); + miopenExecuteFusionPlan(args.handle, fusePlanDesc, args.idesc.desc(), input.const_data_ptr(), args.odesc.desc(), output.data_ptr(), fusionArgs); // Cleanup miopenDestroyFusionPlan(fusePlanDesc); diff --git a/aten/src/ATen/native/miopen/RNN_miopen.cpp b/aten/src/ATen/native/miopen/RNN_miopen.cpp index 51518fbfeb42a..7b2b2ab80e553 100644 --- a/aten/src/ATen/native/miopen/RNN_miopen.cpp +++ b/aten/src/ATen/native/miopen/RNN_miopen.cpp @@ -500,9 +500,7 @@ std::tuple miopen_rnn( auto weight_buf = at::empty(num_weights, x.options()); w_desc.set(weight_buf, 3); weight_buf.zero_(); - std::vector params; - size_t params_stride0; - std::tie(params, params_stride0) = get_parameters(handle, fn.rnn, descs.rnn_desc, 
descs.x_descs[0], w_desc, weight_buf); + auto [params, params_stride0] = get_parameters(handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, weight_buf); if (fn_mode < 2) _copyParams(MatrixRef{weight, static_cast(weight_stride0)}, MatrixRef{params, params_stride0}); @@ -742,9 +740,7 @@ std::vector miopen_rnn_backward_weight( fn_reserve.data_ptr(), fn_reserve.size(0) )); - std::vector grad_params_arr; - size_t grad_params_stride0; - std::tie(grad_params_arr, grad_params_stride0) = get_parameters(handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, dw); + auto [grad_params_arr, grad_params_stride0] = get_parameters(handle, fn.rnn, descs.rnn_desc, descs.x_descs[0], w_desc, dw); if (grad_params_stride0 == static_cast(weight_stride0)) { _viewParams(MatrixRef{grad_params_arr, grad_params_stride0}, MatrixRef{weight_arr, static_cast(weight_stride0)}); @@ -782,8 +778,7 @@ std::tuple> miopen_rnn_backward( auto grad_hy = grad_hy_r.defined() ? grad_hy_r : at::zeros_like(hx, LEGACY_CONTIGUOUS_MEMORY_FORMAT); auto grad_cy = cx.defined() ? (grad_cy_r.defined() ? grad_cy_r : at::zeros_like(cx, LEGACY_CONTIGUOUS_MEMORY_FORMAT)) : grad_cy_r; - Tensor dx, dhx, dcx, ws; - std::tie(dx, dhx, dcx, ws) = at::native::miopen_rnn_backward_input(input, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, {output_mask[0], output_mask[1], output_mask[2]}); + auto [dx, dhx, dcx, ws] = at::native::miopen_rnn_backward_input(input, weight_buf, hx, cx, output, grad_output, grad_hy, grad_cy, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, {output_mask[0], output_mask[1], output_mask[2]}); std::vector dw; if (output_mask[3]) { dw = at::native::miopen_rnn_backward_weight(input, weight, weight_stride0, weight_buf, hx, cx, output, mode, hidden_size, num_layers, batch_first, dropout, train, bidirectional, batch_sizes, dropout_state, reserve, ws); @@ -828,8 +823,7 @@ std::pair _miopen_impl( const Tensor& input, const Tensor& _batch_sizes, const hidden_type& hidden, TensorList params, bool has_biases, miopenRNNMode_t mode, int64_t num_layers, double dropout_p, bool train, bool bidirectional) { - Tensor hx, cx; - std::tie(hx, cx) = unpack_hidden(hidden); + auto [hx, cx] = unpack_hidden(hidden); int64_t hidden_size = hx.size(2); TORCH_CHECK(_batch_sizes.dim() == 1, "batch_sizes tensor should be 1D"); @@ -851,8 +845,7 @@ std::pair _miopen_impl( const Tensor& input, const hidden_type& hidden, TensorList params, bool has_biases, miopenRNNMode_t mode, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { - Tensor hx, cx; - std::tie(hx, cx) = unpack_hidden(hidden); + auto [hx, cx] = unpack_hidden(hidden); int64_t hidden_size = hx.size(2); Tensor dropout_state = at::empty({0}, input.options()); @@ -915,7 +908,7 @@ void lstm_packed_miopen(Tensor& output, Tensor& hy, Tensor& cy, REGISTER_CUDA_DISPATCH(lstm_miopen_stub, &lstm_miopen); REGISTER_CUDA_DISPATCH(lstm_packed_miopen_stub, &lstm_packed_miopen); -} // anonymous namepsace +} // anonymous namespace }} //namespace native. 
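
A recurring mechanical change in these MIOpen hunks (and in the cuDNN file above) is replacing a declare-then-std::tie pair with a C++17 structured binding. A tiny self-contained illustration of the before/after shape of that edit; get_weights here is a stand-in, not a PyTorch function.

#include <cstddef>
#include <tuple>
#include <vector>

static std::tuple<std::vector<int>, size_t> get_weights() {
  return {{1, 2, 3}, 3};
}

void tie_vs_structured_binding() {
  // Before: both variables must be default-constructed, then assigned.
  std::vector<int> params;
  size_t stride0 = 0;
  std::tie(params, stride0) = get_weights();

  // After: a single declaration binds both results directly.
  auto [params2, stride0_2] = get_weights();
  (void)params2;
  (void)stride0_2;
}
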
#endif diff --git a/aten/src/ATen/native/mkl/LinearAlgebra.cpp b/aten/src/ATen/native/mkl/LinearAlgebra.cpp index ce8d848ec1ce6..5e82613f42285 100644 --- a/aten/src/ATen/native/mkl/LinearAlgebra.cpp +++ b/aten/src/ATen/native/mkl/LinearAlgebra.cpp @@ -8,42 +8,50 @@ namespace at { namespace native { void mkl_gemm_batched( const TransposeType trans_A, const TransposeType trans_B, - const int batch_size, const int M, const int N, const int K, const float alpha, - const float** A, const int lda, const float** B, const int ldb, const float beta, - float** C, const int ldc) { + const MKL_INT batch_size, const MKL_INT M, const MKL_INT N, const MKL_INT K, const float alpha, + const float** A, const MKL_INT lda, const float** B, const MKL_INT ldb, const float beta, + float** C, const MKL_INT ldc) { TORCH_INTERNAL_ASSERT(false, "mkl_gemm_batched: ATen not compiled with MKL support"); } void mkl_gemm_batched( const TransposeType trans_A, const TransposeType trans_B, - const int batch_size, const int M, const int N, const int K, const double alpha, - const double** A, const int lda, const double** B, const int ldb, const double beta, - double** C, const int ldc) { + const MKL_INT batch_size, const MKL_INT M, const MKL_INT N, const MKL_INT K, const double alpha, + const double** A, const MKL_INT lda, const double** B, const MKL_INT ldb, const double beta, + double** C, const MKL_INT ldc) { TORCH_INTERNAL_ASSERT(false, "mkl_gemm_batched: ATen not compiled with MKL support"); } void mkl_gemm_batched( const TransposeType trans_A, const TransposeType trans_B, - const int batch_size, const int M, const int N, const int K, const c10::complex alpha, - const c10::complex** A, const int lda, const c10::complex** B, const int ldb, - const c10::complex beta, c10::complex** C, const int ldc) { + const MKL_INT batch_size, const MKL_INT M, const MKL_INT N, const MKL_INT K, const c10::complex alpha, + const c10::complex** A, const MKL_INT lda, const c10::complex** B, const MKL_INT ldb, + const c10::complex beta, c10::complex** C, const MKL_INT ldc) { TORCH_INTERNAL_ASSERT(false, "mkl_gemm_batched: ATen not compiled with MKL support"); } void mkl_gemm_batched( const TransposeType trans_A, const TransposeType trans_B, - const int batch_size, const int M, const int N, const int K, const c10::complex alpha, - const c10::complex** A, const int lda, const c10::complex** B, const int ldb, - const c10::complex beta, c10::complex** C, const int ldc) { + const MKL_INT batch_size, const MKL_INT M, const MKL_INT N, const MKL_INT K, const c10::complex alpha, + const c10::complex** A, const MKL_INT lda, const c10::complex** B, const MKL_INT ldb, + const c10::complex beta, c10::complex** C, const MKL_INT ldc) { TORCH_INTERNAL_ASSERT(false, "mkl_gemm_batched: ATen not compiled with MKL support"); } void mkl_gemm_bf16bf16f32( + TransposeType trans_A, TransposeType trans_B, + MKL_INT M, MKL_INT N, MKL_INT K, const float alpha, + const c10::BFloat16* A, MKL_INT lda, const c10::BFloat16* B, MKL_INT ldb, + const float beta, float* C, MKL_INT ldc) { + TORCH_INTERNAL_ASSERT(false, "mkl_gemm_bf16bf16f32: ATen not compiled with MKL support"); +} + +void mkl_gemm_f16f16f32( TransposeType trans_A, TransposeType trans_B, int M, int N, int K, const float alpha, - const c10::BFloat16* A, int lda, const c10::BFloat16* B, int ldb, + const c10::Half* A, int lda, const c10::Half* B, int ldb, const float beta, float* C, int ldc) { - TORCH_INTERNAL_ASSERT(false, "mkl_gemm_bf16bf16f32: ATen not compiled with MKL support"); + TORCH_INTERNAL_ASSERT(false, 
"mkl_gemm_f16f16f32: ATen not compiled with MKL support"); } }} @@ -66,9 +74,9 @@ static CBLAS_TRANSPOSE to_cblas(TransposeType x) { void mkl_gemm_batched( const TransposeType trans_A, const TransposeType trans_B, - const int batch_size, const int M, const int N, const int K, const float alpha, - const float** A, const int lda, const float** B, const int ldb, const float beta, - float** C, const int ldc) { + const MKL_INT batch_size, const MKL_INT M, const MKL_INT N, const MKL_INT K, const float alpha, + const float** A, const MKL_INT lda, const float** B, const MKL_INT ldb, const float beta, + float** C, const MKL_INT ldc) { auto transa_cblas = to_cblas(trans_A); auto transb_cblas = to_cblas(trans_B); cblas_sgemm_batch(CblasColMajor, &transa_cblas, &transb_cblas, &M, &N, &K, &alpha, @@ -77,9 +85,9 @@ void mkl_gemm_batched( void mkl_gemm_batched( const TransposeType trans_A, const TransposeType trans_B, - const int batch_size, const int M, const int N, const int K, const double alpha, - const double** A, const int lda, const double** B, const int ldb, const double beta, - double** C, const int ldc) { + const MKL_INT batch_size, const MKL_INT M, const MKL_INT N, const MKL_INT K, const double alpha, + const double** A, const MKL_INT lda, const double** B, const MKL_INT ldb, const double beta, + double** C, const MKL_INT ldc) { auto transa_cblas = to_cblas(trans_A); auto transb_cblas = to_cblas(trans_B); cblas_dgemm_batch(CblasColMajor, &transa_cblas, &transb_cblas, &M, &N, &K, &alpha, @@ -88,9 +96,9 @@ void mkl_gemm_batched( void mkl_gemm_batched( const TransposeType trans_A, const TransposeType trans_B, - const int batch_size, const int M, const int N, const int K, const c10::complex alpha, - const c10::complex** A, const int lda, const c10::complex** B, const int ldb, - const c10::complex beta, c10::complex** C, const int ldc) { + const MKL_INT batch_size, const MKL_INT M, const MKL_INT N, const MKL_INT K, const c10::complex alpha, + const c10::complex** A, const MKL_INT lda, const c10::complex** B, const MKL_INT ldb, + const c10::complex beta, c10::complex** C, const MKL_INT ldc) { auto transa_cblas = to_cblas(trans_A); auto transb_cblas = to_cblas(trans_B); cblas_cgemm_batch(CblasColMajor, &transa_cblas, &transb_cblas, &M, &N, &K, @@ -101,9 +109,9 @@ void mkl_gemm_batched( void mkl_gemm_batched( const TransposeType trans_A, const TransposeType trans_B, - const int batch_size, const int M, const int N, const int K, const c10::complex alpha, - const c10::complex** A, const int lda, const c10::complex** B, const int ldb, - const c10::complex beta, c10::complex** C, const int ldc) { + const MKL_INT batch_size, const MKL_INT M, const MKL_INT N, const MKL_INT K, const c10::complex alpha, + const c10::complex** A, const MKL_INT lda, const c10::complex** B, const MKL_INT ldb, + const c10::complex beta, c10::complex** C, const MKL_INT ldc) { auto transa_cblas = to_cblas(trans_A); auto transb_cblas = to_cblas(trans_B); cblas_zgemm_batch(CblasColMajor, &transa_cblas, &transb_cblas, &M, &N, &K, @@ -114,9 +122,9 @@ void mkl_gemm_batched( void mkl_gemm_bf16bf16f32( TransposeType trans_A, TransposeType trans_B, - int M, int N, int K, const float alpha, - const c10::BFloat16* A, int lda, const c10::BFloat16* B, int ldb, - const float beta, float* C, int ldc) { + MKL_INT M, MKL_INT N, MKL_INT K, const float alpha, + const c10::BFloat16* A, MKL_INT lda, const c10::BFloat16* B, MKL_INT ldb, + const float beta, float* C, MKL_INT ldc) { #ifdef MKL_HAS_SBGEMM auto transa_cblas = to_cblas(trans_A); auto 
transb_cblas = to_cblas(trans_B); @@ -127,6 +135,21 @@ void mkl_gemm_bf16bf16f32( #endif } +void mkl_gemm_f16f16f32( + TransposeType trans_A, TransposeType trans_B, + int M, int N, int K, const float alpha, + const c10::Half* A, int lda, const c10::Half* B, int ldb, + const float beta, float* C, int ldc) { +#ifdef MKL_HAS_SHGEMM + auto transa_cblas = to_cblas(trans_A); + auto transb_cblas = to_cblas(trans_B); + cblas_gemm_f16f16f32(CblasColMajor, transa_cblas, transb_cblas, M, N, K, alpha, + (const MKL_F16*)A, lda, (const MKL_F16*)B, ldb, beta, C, ldc); +#else + TORCH_INTERNAL_ASSERT(false, "mkl_gemm_f16f16f32 requires mkl version >= 2024.0"); +#endif +} + }} // namespace at::native #endif diff --git a/aten/src/ATen/native/mkl/LinearAlgebra.h b/aten/src/ATen/native/mkl/LinearAlgebra.h index a881e7ba49b1e..54a7a5cb2c741 100644 --- a/aten/src/ATen/native/mkl/LinearAlgebra.h +++ b/aten/src/ATen/native/mkl/LinearAlgebra.h @@ -1,39 +1,51 @@ #pragma once +#include #include #include #include +#if !AT_MKL_ENABLED() +#define MKL_INT int +#else +#include +#endif + namespace at { namespace native { void mkl_gemm_batched( TransposeType trans_A, TransposeType trans_B, - int batch_size, int M, int N, int K, float alpha, - const float** A, int lda, const float** B, int ldb, float beta, - float** C, int ldc); + MKL_INT batch_size, MKL_INT M, MKL_INT N, MKL_INT K, float alpha, + const float** A, MKL_INT lda, const float** B, MKL_INT ldb, float beta, + float** C, MKL_INT ldc); void mkl_gemm_batched( TransposeType trans_A, TransposeType trans_B, - int batch_size, int M, int N, int K, double alpha, - const double** A, int lda, const double** B, int ldb, double beta, - double** C, int ldc); + MKL_INT batch_size, MKL_INT M, MKL_INT N, MKL_INT K, double alpha, + const double** A, MKL_INT lda, const double** B, MKL_INT ldb, double beta, + double** C, MKL_INT ldc); void mkl_gemm_batched( TransposeType trans_A, TransposeType trans_B, - int batch_size, int M, int N, int K, c10::complex alpha, - const c10::complex** A, int lda, const c10::complex** B, int ldb, - c10::complex beta, c10::complex** C, int ldc); + MKL_INT batch_size, MKL_INT M, MKL_INT N, MKL_INT K, c10::complex alpha, + const c10::complex** A, MKL_INT lda, const c10::complex** B, MKL_INT ldb, + c10::complex beta, c10::complex** C, MKL_INT ldc); void mkl_gemm_batched( TransposeType trans_A, TransposeType trans_B, - int batch_size, int M, int N, int K, c10::complex alpha, - const c10::complex** A, int lda, const c10::complex** B, int ldb, - c10::complex beta, c10::complex** C, int ldc); + MKL_INT batch_size, MKL_INT M, MKL_INT N, MKL_INT K, c10::complex alpha, + const c10::complex** A, MKL_INT lda, const c10::complex** B, MKL_INT ldb, + c10::complex beta, c10::complex** C, MKL_INT ldc); void mkl_gemm_bf16bf16f32( + TransposeType trans_A, TransposeType trans_B, + MKL_INT M, MKL_INT N, MKL_INT K, const float alpha, + const c10::BFloat16* A, MKL_INT lda, const c10::BFloat16* B, MKL_INT ldb, + const float beta, float* C, MKL_INT ldc); + +void mkl_gemm_f16f16f32( TransposeType trans_A, TransposeType trans_B, int M, int N, int K, const float alpha, - const c10::BFloat16* A, int lda, const c10::BFloat16* B, int ldb, + const c10::Half* A, int lda, const c10::Half* B, int ldb, const float beta, float* C, int ldc); - }} // namespace at::native diff --git a/aten/src/ATen/native/mkl/SparseCsrLinearAlgebra.cpp b/aten/src/ATen/native/mkl/SparseCsrLinearAlgebra.cpp index 3d0acb9aae751..b938ccd937a8d 100644 --- a/aten/src/ATen/native/mkl/SparseCsrLinearAlgebra.cpp +++ 
b/aten/src/ATen/native/mkl/SparseCsrLinearAlgebra.cpp @@ -2,12 +2,11 @@ #include #include -// Don't compile with MKL for MSVC/macos since linking the sparse MKL routines +// Don't compile with MKL for macos since linking the sparse MKL routines // needs some build fixes. -// https://github.com/pytorch/pytorch/pull/50937#issuecomment-778732740 // Macros source: // https://web.archive.org/web/20191012035921/http://nadeausoftware.com/articles/2012/01/c_c_tip_how_use_compiler_predefined_macros_detect_operating_system -#if !AT_MKL_ENABLED() || defined(_MSC_VER) || defined(__APPLE__) || \ +#if !AT_MKL_ENABLED() || defined(__APPLE__) || \ defined(__MACH__) namespace at { @@ -19,9 +18,7 @@ Tensor& _sparse_mm_mkl_( const Tensor& t, const Scalar& alpha, const Scalar& beta) { -#if _MSC_VER - AT_ERROR("sparse_mm_mkl: MKL support is disabled on Windows"); -#elif __APPLE__ || __MACH__ +#if __APPLE__ || __MACH__ AT_ERROR("sparse_mm_mkl: MKL support is disabled on macos/iOS."); #else AT_ERROR("sparse_mm_mkl: ATen not compiled with MKL support"); diff --git a/aten/src/ATen/native/mkl/SpectralOps.cpp b/aten/src/ATen/native/mkl/SpectralOps.cpp index cb00ce99d82e6..e26cfbf6d8eba 100644 --- a/aten/src/ATen/native/mkl/SpectralOps.cpp +++ b/aten/src/ATen/native/mkl/SpectralOps.cpp @@ -200,7 +200,7 @@ Tensor& _fft_c2c_mkl_out(const Tensor& self, IntArrayRef dim, int64_t normalizat } }} // namespace at::native -#endif /* AT_MKL_ENALED() || AT_POCKETFFT_ENABLED() */ +#endif /* AT_MKL_ENABLED() || AT_POCKETFFT_ENABLED() */ #if AT_POCKETFFT_ENABLED() #include @@ -229,7 +229,7 @@ inline std::complex *tensor_cdata(Tensor& t) { template inline const std::complex *tensor_cdata(const Tensor& t) { - return reinterpret_cast*>(t.data_ptr>()); + return reinterpret_cast*>(t.const_data_ptr>()); } template @@ -291,11 +291,11 @@ Tensor _fft_r2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, pocketfft::shape_t axes(dim.begin(), dim.end()); if (self.scalar_type() == kFloat) { pocketfft::r2c(shape_from_tensor(self), stride_from_tensor(self), stride_from_tensor(out), axes, true, - self.data_ptr(), + self.const_data_ptr(), tensor_cdata(out), compute_fct(self, dim, normalization)); } else { pocketfft::r2c(shape_from_tensor(self), stride_from_tensor(self), stride_from_tensor(out), axes, true, - self.data_ptr(), + self.const_data_ptr(), tensor_cdata(out), compute_fct(self, dim, normalization)); } @@ -307,6 +307,10 @@ Tensor _fft_r2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, bool forward) { TORCH_CHECK(self.is_complex()); + if (dim.empty()) { + return self.clone(); + } + auto out = at::empty(self.sizes(), self.options()); pocketfft::shape_t axes(dim.begin(), dim.end()); if (self.scalar_type() == kComplexFloat) { @@ -480,9 +484,9 @@ static Tensor& _exec_fft(Tensor& out, const Tensor& self, IntArrayRef out_sizes, // run the FFT if (forward) { - MKL_DFTI_CHECK(DftiComputeForward(descriptor.get(), input.data_ptr(), out.data_ptr())); + MKL_DFTI_CHECK(DftiComputeForward(descriptor.get(), const_cast(input.const_data_ptr()), out.data_ptr())); } else { - MKL_DFTI_CHECK(DftiComputeBackward(descriptor.get(), input.data_ptr(), out.data_ptr())); + MKL_DFTI_CHECK(DftiComputeBackward(descriptor.get(), const_cast(input.const_data_ptr()), out.data_ptr())); } // Inplace reshaping to original batch shape and inverting the dimension permutation @@ -556,6 +560,10 @@ Tensor _fft_r2c_mkl(const Tensor& self, IntArrayRef dim, int64_t 
normalization, // n-dimensional complex to complex FFT/IFFT Tensor _fft_c2c_mkl(const Tensor& self, IntArrayRef dim, int64_t normalization, bool forward) { TORCH_CHECK(self.is_complex()); + if (dim.empty()) { + return self.clone(); + } + const auto sorted_dims = _sort_dims(self, dim); auto out = at::empty(self.sizes(), self.options()); return _exec_fft(out, self, self.sizes(), sorted_dims, normalization, forward); diff --git a/aten/src/ATen/native/mkldnn/Conv.cpp b/aten/src/ATen/native/mkldnn/Conv.cpp index b5e53732a472e..3e41e2f1071d0 100644 --- a/aten/src/ATen/native/mkldnn/Conv.cpp +++ b/aten/src/ATen/native/mkldnn/Conv.cpp @@ -223,10 +223,10 @@ static void _mkldnn_convolution_out ( auto memory_format = mkldnn_convolution_memory_format(input_t.ndimension(), is_channels_last); auto input = input_t.is_mkldnn() ? input_t : input_t.contiguous(memory_format); auto weight = weight_t.is_mkldnn() ? weight_t : weight_t.contiguous(memory_format); - const ideep::tensor x = itensor_from_tensor(input); - const ideep::tensor w = itensor_from_tensor(weight); + const ideep::tensor x = itensor_from_tensor(input, /*from_const_data_ptr*/true); + const ideep::tensor w = itensor_from_tensor(weight, /*from_const_data_ptr*/true); if (bias.defined()) { - const ideep::tensor b = itensor_from_tensor(bias); + const ideep::tensor b = itensor_from_tensor(bias, /*from_const_data_ptr*/true); ideep::convolution_forward::compute_v3( x, w, @@ -704,9 +704,9 @@ Tensor _mkldnn_convolution_transpose( auto output_sizes = conv_input_size(input.sizes(), weight_IOHW_sizes, padding_expanded, output_padding_expanded, stride_expanded, dilation_expanded, groups); auto output = at::empty({0}, input.options()); - const ideep::tensor x = itensor_from_tensor(input); + const ideep::tensor x = itensor_from_tensor(input, /*from_const_data_ptr*/true); - ideep::tensor w = itensor_from_tensor(weight); + ideep::tensor w = itensor_from_tensor(weight, /*from_const_data_ptr*/true); if (!weight.is_mkldnn()) { // mkldnn transposed convolution has weight in logical order of OIHW or OIDHW, // while PyTorch has IOHW or IODHW, `._tranpose()` switches strides (no memory copy). 
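
The const_data_ptr changes in this file (and in the MIOpen files above) follow one pattern: arguments that are only read are fetched through const_data_ptr(), and any const_cast required by a third-party API that accepts only non-const pointers is confined to the call site and commented there. A schematic example of that pattern; fake_library_op is a made-up stand-in for such an API.

#include <ATen/ATen.h>

// Stand-in for a legacy C-style API that takes non-const pointers even for
// read-only inputs.
static void fake_library_op(float* in, float* out, int64_t n) {
  for (int64_t i = 0; i < n; ++i) {
    out[i] = in[i];
  }
}

void run_readonly_input(const at::Tensor& input, at::Tensor& output) {
  TORCH_CHECK(input.numel() == output.numel(), "size mismatch");
  // const_data_ptr() documents that this op never writes through `in`.
  const float* in = input.const_data_ptr<float>();
  float* out = output.data_ptr<float>();
  // The cast is localized and explained here, mirroring the batch-norm hunks.
  fake_library_op(const_cast<float*>(in), out, input.numel());
}
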
@@ -720,7 +720,7 @@ Tensor _mkldnn_convolution_transpose( } if (bias.defined()) { - const ideep::tensor b = itensor_from_tensor(bias); + const ideep::tensor b = itensor_from_tensor(bias, /*from_const_data_ptr*/true); ideep::convolution_transpose_forward::compute_v3( x, w, @@ -825,8 +825,8 @@ Tensor mkldnn_convolution_backward_input( bool is_channels_last) { auto grad_input = at::empty({0}, grad_output.options()); - auto grad_y = itensor_from_tensor(grad_output); - auto w = itensor_view_from_dense(weight); + auto grad_y = itensor_from_tensor(grad_output, /*from_const_data_ptr*/true); + auto w = itensor_view_from_dense(weight, /*from_const_data_ptr*/true); ideep::tensor grad_x; if (is_channels_last) { @@ -865,8 +865,8 @@ std::tuple mkldnn_convolution_backward_weights( int64_t groups, bool bias_defined, bool is_channels_last) { - const ideep::tensor grad_y = itensor_from_tensor(grad_output); - const ideep::tensor x = itensor_from_tensor(input); + const ideep::tensor grad_y = itensor_from_tensor(grad_output, /*from_const_data_ptr*/true); + const ideep::tensor x = itensor_from_tensor(input, /*from_const_data_ptr*/true); ideep::tensor grad_w, grad_b; if (bias_defined) { @@ -975,8 +975,8 @@ Tensor mkldnn_convolution_transpose_backward_input( bool is_channels_last) { auto grad_input = at::empty({0}, grad_output.options()); - auto grad_y = itensor_from_tensor(grad_output); - auto w = itensor_view_from_dense(weight).transpose_(0, 1); + auto grad_y = itensor_from_tensor(grad_output, /*from_const_data_ptr*/true); + auto w = itensor_view_from_dense(weight, /*from_const_data_ptr*/true).transpose_(0, 1); ideep::tensor grad_x; if (is_channels_last) { @@ -1016,8 +1016,8 @@ std::tuple mkldnn_convolution_transpose_backward_weights( int64_t groups, bool bias_defined, bool is_channels_last) { - auto grad_y = itensor_from_tensor(grad_output); - auto x = itensor_from_tensor(input); + auto grad_y = itensor_from_tensor(grad_output, /*from_const_data_ptr*/true); + auto x = itensor_from_tensor(input, /*from_const_data_ptr*/true); ideep::tensor grad_w, grad_b; if (bias_defined) { diff --git a/aten/src/ATen/native/mkldnn/IDeepRegistration.cpp b/aten/src/ATen/native/mkldnn/IDeepRegistration.cpp index 5977b045951f6..f102756ebbb93 100644 --- a/aten/src/ATen/native/mkldnn/IDeepRegistration.cpp +++ b/aten/src/ATen/native/mkldnn/IDeepRegistration.cpp @@ -30,4 +30,4 @@ void clear_computation_cache() { } // namespace at::native::mkldnn -#endif // AT_MKLDNN_ENALBED() +#endif // AT_MKLDNN_ENABLED() diff --git a/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp b/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp index 054953635591d..061d154f3b40f 100644 --- a/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp +++ b/aten/src/ATen/native/mkldnn/MKLDNNCommon.cpp @@ -81,7 +81,7 @@ ideep::tensor& itensor_from_mkldnn(const MKLDNNTensor& mkldnn_tensor) { return mklimpl->unsafe_opaque_handle()->get_target(); } -ideep::tensor itensor_view_from_dense(const Tensor& tensor) { +ideep::tensor itensor_view_from_dense(const Tensor& tensor, bool from_const_data_ptr) { TORCH_CHECK( tensor.device().is_cpu(), "itensor_view_from_dense expects CPU tensor input"); @@ -92,31 +92,41 @@ ideep::tensor itensor_view_from_dense(const Tensor& tensor) { return {{tensor.sizes().vec(), ideep::tensor::data_type::f32, tensor.strides().vec()}, - tensor.template data_ptr()}; + from_const_data_ptr ? 
+ const_cast(tensor.template const_data_ptr()) : + tensor.template data_ptr()}; } else if (tensor.scalar_type() == ScalarType::BFloat16) { return {{tensor.sizes().vec(), ideep::tensor::data_type::bf16, tensor.strides().vec()}, - tensor.template data_ptr()}; + from_const_data_ptr ? + const_cast(tensor.template const_data_ptr()) : + tensor.template data_ptr()}; } else if (tensor.scalar_type() == ScalarType::Half) { return {{tensor.sizes().vec(), ideep::tensor::data_type::f16, tensor.strides().vec()}, - tensor.template data_ptr()}; + from_const_data_ptr ? + const_cast(tensor.template const_data_ptr()) : + tensor.template data_ptr()}; } else if (tensor.scalar_type() == ScalarType::Byte) { return {{tensor.sizes().vec(), ideep::tensor::data_type::u8, tensor.strides().vec()}, - tensor.data_ptr()}; + from_const_data_ptr ? + const_cast(tensor.const_data_ptr()) : + tensor.data_ptr()}; } else if (tensor.scalar_type() == ScalarType::Char) { return {{tensor.sizes().vec(), ideep::tensor::data_type::s8, tensor.strides().vec()}, - tensor.data_ptr()}; + from_const_data_ptr ? + const_cast(tensor.const_data_ptr()) : + tensor.data_ptr()}; } else { TORCH_CHECK(false, "itensor_view_from_dense expects float/bfloat16/half/int8 tensor input"); @@ -145,11 +155,11 @@ ideep::tensor itensor_view_from_dense( // tensor is just a view of the storage of the aten dense tensor, so // caller needs to make sure the aten dense tensor's lifetime is // longer than the ideep tensor. -ideep::tensor itensor_from_tensor(const Tensor& tensor) { +ideep::tensor itensor_from_tensor(const Tensor& tensor, bool from_const_data_ptr) { if (tensor.is_mkldnn()) { return itensor_from_mkldnn(tensor); } else { - return itensor_view_from_dense(tensor); + return itensor_view_from_dense(tensor, from_const_data_ptr); } } diff --git a/aten/src/ATen/native/mkldnn/MKLDNNCommon.h b/aten/src/ATen/native/mkldnn/MKLDNNCommon.h index 86fc25c0f2fcd..5e9044ce908aa 100644 --- a/aten/src/ATen/native/mkldnn/MKLDNNCommon.h +++ b/aten/src/ATen/native/mkldnn/MKLDNNCommon.h @@ -36,7 +36,7 @@ TORCH_API ideep::tensor& itensor_from_mkldnn(const Tensor& mkldnn_tensor); // Construct an `ideep::tensor` "view" from dense tensor, note the // ideep::tensor will share the underlying buffer -TORCH_API ideep::tensor itensor_view_from_dense(const Tensor& tensor); +TORCH_API ideep::tensor itensor_view_from_dense(const Tensor& tensor, bool from_const_data_ptr=false); // Construct an `ideep::tensor` "view" from dense tensor using given desc, note // the ideep::tensor will share the underlying buffer @@ -45,7 +45,7 @@ TORCH_API ideep::tensor itensor_view_from_dense( const ideep::tensor::desc& desc); // Helper function for getting an ideep tensor out of an aten Tensor or MKL-DNN tensor. 
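// Usage sketch for the new `from_const_data_ptr` flag (illustrative only;
// `make_forward_views` is a hypothetical caller, not part of this patch).
// Forward-path code that only reads a tensor can ask for a view backed by
// const_data_ptr(); the const_cast in MKLDNNCommon.cpp above exists solely
// because the ideep/oneDNN descriptor constructor takes a non-const pointer,
// and the buffer is never written through the resulting view.
inline void make_forward_views(const at::Tensor& input, const at::Tensor& weight) {
  const ideep::tensor x = itensor_from_tensor(input, /*from_const_data_ptr*/true);
  const ideep::tensor w = itensor_from_tensor(weight, /*from_const_data_ptr*/true);
  // x and w alias the ATen storage; the caller must keep `input` and `weight`
  // alive for as long as the ideep views are in use.
  (void)x;
  (void)w;
}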
-TORCH_API ideep::tensor itensor_from_tensor(const Tensor& tensor); +TORCH_API ideep::tensor itensor_from_tensor(const Tensor& tensor, bool from_const_data_ptr=false); // Set MKLDNN verbose level TORCH_API int set_verbose(int level); diff --git a/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp b/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp index 1f92705171f6d..b2901bc522be2 100644 --- a/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp +++ b/aten/src/ATen/native/mkldnn/MKLDNNConversions.cpp @@ -198,24 +198,40 @@ Tensor mkldnn_reorder_conv3d_weight( IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups) { + int64_t groups, + c10::OptionalArrayRef input_size) { mkldnn_check_low_precision(self.scalar_type(), "mkldnn_reorder_conv3d_weight"); const auto padding_expanded = expand_param_if_needed(padding, "padding", 3); const auto stride_expanded = expand_param_if_needed(stride, "stride", 3); const auto dilation_expanded = expand_param_if_needed(dilation, "dilation", 3); - auto w = itensor_from_mkldnn(self); - - auto desc = - ideep::convolution_forward::expected_weights_desc( - w.get_dims(), - w.get_data_type(), - stride_expanded, - padding_expanded, - padding_expanded, - dilation_expanded, - groups, - ideep::algorithm::convolution_direct); + ideep::dims src_dims = ideep::dims(); + bool is_channels_last = false; + auto memory_format = at::MemoryFormat::Contiguous; + if (input_size.has_value()) { + src_dims = input_size.value().vec(); + // if has input size, we always use channels last. + is_channels_last = true; + memory_format = at::MemoryFormat::ChannelsLast3d; + } + + auto self_ = self.is_mkldnn() ? self : self.contiguous(memory_format); + auto w = itensor_from_tensor(self_); + + auto desc = ideep::convolution_forward::expected_weights_desc( + w.get_dims(), + w.get_data_type(), + stride_expanded, + padding_expanded, + padding_expanded, + dilation_expanded, + groups, + ideep::algorithm::convolution_direct, + ideep::prop_kind::forward, + w.get_data_type(), + src_dims, + ideep::attr_t(), + is_channels_last); ideep::tensor result; result.init(desc); result.feed_from(w); @@ -223,6 +239,21 @@ Tensor mkldnn_reorder_conv3d_weight( return new_with_itensor_mkldnn(std::move(result), optTypeMetaToScalarType(self.options().dtype_opt()), self.options().device_opt()); } +static Tensor mkldnn_reorder_conv_weight( + const Tensor& self, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + c10::OptionalArrayRef input_size) { + TORCH_CHECK((self.dim() == 4 || self.dim() == 5), "mkldnn_reorder_conv_weight only supports conv2d and conv3d"); + if (self.dim() == 4) { + return at::native::mkldnn_reorder_conv2d_weight(self, padding, stride, dilation, groups, input_size); + } else { + return at::native::mkldnn_reorder_conv3d_weight(self, padding, stride, dilation, groups, input_size); + } +} + static Tensor mkldnn_reorder_linear_weight( const Tensor& self, c10::optional batch_size_opt) { @@ -389,9 +420,7 @@ static std::tuple get_lstm_packed_weights( get_mkldnn_dtype(weight_hh.scalar_type()), ideep::format_tag::ldgoi}); - ideep::tensor::desc packed_desc_ih, packed_desc_hh; - - std::tie(packed_desc_ih, packed_desc_hh) = + auto [packed_desc_ih, packed_desc_hh] = ideep::lstm_forward_inference::expected_weights_desc( output_sizes, src_layer, @@ -443,12 +472,11 @@ static std::vector mkldnn_reorder_mkldnn_rnn_layer_weight( batch_size = 10; } - ideep::tensor w1_, w2_; at::Tensor packed_w1, packed_w2; int64_t feature_size = weight0.size(-1); - std::tie(w1_, 
w2_) = get_lstm_packed_weights( + auto [w1_, w2_] = get_lstm_packed_weights( weight0, weight1, at::zeros( @@ -489,7 +517,7 @@ TORCH_LIBRARY_IMPL(mkldnn, CPU, m) { TORCH_FN(mkldnn_reorder_linear_weight)); m.impl( TORCH_SELECTIVE_NAME("mkldnn::_reorder_convolution_weight"), - TORCH_FN(mkldnn_reorder_conv2d_weight)); + TORCH_FN(mkldnn_reorder_conv_weight)); m.impl( TORCH_SELECTIVE_NAME("mkldnn::_reorder_mkldnn_rnn_layer_weight"), TORCH_FN(mkldnn_reorder_mkldnn_rnn_layer_weight)); @@ -520,7 +548,8 @@ Tensor mkldnn_reorder_conv3d_weight( IntArrayRef padding, IntArrayRef stride, IntArrayRef dilation, - int64_t groups) { + int64_t groups, + c10::OptionalArrayRef input_size) { TORCH_CHECK(false, "mkldnn_reorder_conv3d_weight: MKL-DNN build is disabled"); } diff --git a/aten/src/ATen/native/mkldnn/Matmul.cpp b/aten/src/ATen/native/mkldnn/Matmul.cpp index a6770cca1d35c..db02e5f3857a6 100644 --- a/aten/src/ATen/native/mkldnn/Matmul.cpp +++ b/aten/src/ATen/native/mkldnn/Matmul.cpp @@ -53,14 +53,38 @@ bool mkldnn_fp16_gemm( c10::Half *c, int64_t ldc) { return false; } +bool mkldnn_bf32_gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + float alpha, + const float *a, int64_t lda, + const float *b, int64_t ldb, + float beta, + float *c, int64_t ldc){ + return false; + } + +bool use_mkldnn_bf32_matmul( + const Tensor& mat1, + const Tensor& mat2, + const Tensor& result) { + return false; +} -bool use_mkldnn_lower_precision_matmul( +bool use_mkldnn_matmul( const Tensor& mat1, const Tensor& mat2, const Tensor& result) { return false; } +void mkldnn_matmul_i8i8i32( + const Tensor &mat1, + const Tensor &mat2, + const Tensor &result) { + TORCH_INTERNAL_ASSERT(false, __func__, ": ATen not compiled with MKLDNN support"); +} + } // namespace native } // namespace at @@ -80,13 +104,18 @@ static bool use_mkldnn_fp16_matmul() { return at::globalContext().userEnabledMkldnn() && mkldnn_fp16_device_check(); } +static bool use_mkldnn_bf32_matmul() { + return use_mkldnn_bf16_matmul() && at::globalContext().float32MatmulPrecision() == at::Float32MatmulPrecision::MEDIUM; +} + template inline typename std::enable_if_t< + std::is_same_v || std::is_same_v || std::is_same_v, bool> -mkldnn_lowerp_gemm( +mkldnn_gemm( TransposeType transa, TransposeType transb, int64_t m, int64_t n, int64_t k, float alpha, @@ -94,8 +123,10 @@ mkldnn_lowerp_gemm( const scalar_t *b_data, int64_t ldb, float beta, scalar_t *c_data, int64_t ldc) { - if (!(std::is_same_v ? 
use_mkldnn_bf16_matmul() - : use_mkldnn_fp16_matmul()) || + bool bf16_usable = std::is_same_v && use_mkldnn_bf16_matmul(); + bool fp16_usable = std::is_same_v && use_mkldnn_fp16_matmul(); + bool bf32_usable = std::is_same_v && use_mkldnn_bf32_matmul(); + if ( !(bf16_usable || fp16_usable || bf32_usable) || (m * n * k <= 16 * 16 * 16) || (alpha == 0.0f)) { return false; } @@ -105,6 +136,7 @@ mkldnn_lowerp_gemm( if (beta != 0.0f) { op_attr = ideep::attr_t::fuse_sum(); } + if (std::is_same_v) op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16); // bf32 path // NOTE: View as c-contiguous to avoid extra reordering in mkldnn // Use identity: C = AB <=> C^T = B^T A^T @@ -117,9 +149,12 @@ mkldnn_lowerp_gemm( } auto idtype = ideep::tensor::data_type::bf16; - if constexpr (!std::is_same_v) { + if constexpr (std::is_same_v) { idtype = ideep::tensor::data_type::f16; } + if constexpr (std::is_same_v) { + idtype = ideep::tensor::data_type::f32; + } ideep::tensor a({ /*sizes=*/{k, m}, @@ -164,7 +199,7 @@ bool mkldnn_bf16_gemm( const c10::BFloat16 *b, int64_t ldb, float beta, c10::BFloat16 *c, int64_t ldc) { - return mkldnn_lowerp_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + return mkldnn_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } bool mkldnn_fp16_gemm( @@ -175,9 +210,19 @@ bool mkldnn_fp16_gemm( const c10::Half *b, int64_t ldb, float beta, c10::Half *c, int64_t ldc) { - return mkldnn_lowerp_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + return mkldnn_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); } +bool mkldnn_bf32_gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + float alpha, + const float *a, int64_t lda, + const float *b, int64_t ldb, + float beta, + float *c, int64_t ldc){ + return mkldnn_gemm(transa, transb, m, n, k, alpha, a, lda, b, ldb, beta, c, ldc); + } void mkldnn_matmul( const Tensor &mat1, @@ -205,11 +250,12 @@ void mkldnn_matmul( #else TORCH_CHECK( (mat1.scalar_type() == at::kBFloat16 || - mat1.scalar_type() == at::kHalf) && + mat1.scalar_type() == at::kHalf || + mat1.scalar_type() == at::kFloat) && mat2.scalar_type() == mat1.scalar_type() && result.scalar_type() == mat1.scalar_type(), "mkldnn_matmul: only enabled for bf16 and fp16 path"); - if (mat1.scalar_type() == at::kBFloat16) { + if (mat1.scalar_type() == at::kBFloat16 || mat1.scalar_type() == at::kFloat) { TORCH_CHECK( mkldnn_bf16_device_check(), "mkldnn_matmul: mkldnn_matmul bf16 path needs the cpu support avx_ne_convert or avx512bw, avx512vl and avx512dq, or AWS Graviton3"); @@ -230,6 +276,7 @@ void mkldnn_matmul( // but mkldnn matmul primitive only support bias be 1-D tensors // to address their differences, we use mkldnn post ops to perform a fused "add" after matrix multiplication is over if (beta != 0.0f) op_attr = ideep::attr_t::fuse_sum(); + if (mat1.scalar_type() == at::kFloat) op_attr.set_fpmath_mode(dnnl_fpmath_mode_bf16); // bf32 path // If alpha = 0, dose not need actually do gemm computation if (alpha == 0) return; @@ -340,11 +387,129 @@ bool use_mkldnn_fp16_matmul( checksize(mat1, mat2)); } -bool use_mkldnn_lower_precision_matmul( +bool use_mkldnn_bf32_matmul( const Tensor& mat1, const Tensor& mat2, const Tensor& result) { - return (use_mkldnn_bf16_matmul(mat1, mat2, result) || use_mkldnn_fp16_matmul(mat1, mat2, result)); + + return ( + use_mkldnn_bf32_matmul() && + mat1.scalar_type() == kFloat && + mat2.scalar_type() == kFloat && + (!result.defined() || result.scalar_type() == kFloat) && + 
mat1.numel() != 0 && + mat2.numel() != 0 && + checksize(mat1, mat2)); +} + +bool use_mkldnn_matmul( + const Tensor& mat1, + const Tensor& mat2, + const Tensor& result) { + return (use_mkldnn_bf16_matmul(mat1, mat2, result) || use_mkldnn_fp16_matmul(mat1, mat2, result) || use_mkldnn_bf32_matmul(mat1, mat2, result)); +} + +static void _mkldnn_matmul_i8i8i32_with_primitive( + const Tensor &mat1, + const Tensor &mat2, + const Tensor &result) { + // Create ideep tensors for oneDNN computation + auto src = ideep::tensor( + {mat1.sizes().vec(), + ideep::tensor::data_type::s8, + mat1.strides().vec()}, + mat1.data_ptr()); + auto wei = ideep::tensor( + {mat2.sizes().vec(), + ideep::tensor::data_type::s8, + mat2.strides().vec()}, + mat2.data_ptr()); + auto dst = ideep::tensor( + {result.sizes().vec(), + ideep::tensor::data_type::s32, + result.strides().vec()}, + result.data_ptr()); + // Create primitive desc + auto engine = ideep::engine::cpu_engine(); + ideep::attr_t op_attr; + op_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + auto src_desc = src.get_desc(); + auto wei_desc = wei.get_desc(); + auto dst_desc = dst.get_desc(); + auto prim_desc = dnnl::matmul::primitive_desc( + engine, src_desc, wei_desc, dst_desc, op_attr); + // Reorder mat2 if needed + auto expected_weight = wei.reorder_if_differ_in(prim_desc.weights_desc()); + // Prepare args for primitive + ideep::tensor scratchpad(prim_desc.scratchpad_desc()); + ideep::exec_args args; + args.insert({DNNL_ARG_SRC, src}); + args.insert({DNNL_ARG_WEIGHTS, expected_weight}); + args.insert({DNNL_ARG_DST, dst}); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad}); + // Create primitve and execute + auto primitive = dnnl::matmul(prim_desc); + primitive.execute(ideep::stream::default_stream(), args); +} + +static void _mkldnn_gemm_i8i8i32_with_blas( + const Tensor& self, + const Tensor& mat2, + const Tensor& result) { + const int m = result.size(0); + const int n = result.size(1); + const int k = self.size(1); + + const char transa = self.strides()[1] == 1 ? 'N' : 'T'; + const char transb = mat2.strides()[1] == 1 ? 'N' : 'T'; + const char offsetc = 'F'; + + const int lda = transa == 'T' ? self.stride(1) : self.stride(0); + const int ldb = transb == 'T' ? 
mat2.stride(1) : mat2.stride(0); + const int ldc = n; + + const float alpha = 1; + const float beta = 0; + + int8_t ao = 0; + int8_t bo = 0; + int32_t co = 0; + + dnnl::gemm_s8s8s32( + transa, + transb, + offsetc, + m, + n, + k, + alpha, + (int8_t*)self.data_ptr(), + lda, + ao, + (int8_t*)mat2.data_ptr(), + ldb, + bo, + beta, + (int32_t*)result.data_ptr(), + ldc, + &co); + } + +void mkldnn_matmul_i8i8i32( + const Tensor &mat1, + const Tensor &mat2, + const Tensor &result) { + // x:s8 * w:s8 -> y:s32 + // both inputs should be 2d + // In most cases, using the DNNL blas API is faster, but it requires a/b contiguous along one dimension + bool a_is_contiguous = (mat1.stride(0) == 1 || mat1.stride(1) == 1); + bool b_is_contiguous = (mat2.stride(0) == 1 || mat2.stride(1) == 1); + + if (a_is_contiguous && b_is_contiguous) { + _mkldnn_gemm_i8i8i32_with_blas(mat1, mat2, result); + } else { + _mkldnn_matmul_i8i8i32_with_primitive(mat1, mat2, result); + } } } // namespace native diff --git a/aten/src/ATen/native/mkldnn/Matmul.h b/aten/src/ATen/native/mkldnn/Matmul.h index 86452c416953b..d82bb310efeba 100644 --- a/aten/src/ATen/native/mkldnn/Matmul.h +++ b/aten/src/ATen/native/mkldnn/Matmul.h @@ -24,6 +24,11 @@ bool use_mkldnn_fp16_matmul( const Tensor& mat2, const Tensor& result_opt); +bool use_mkldnn_bf32_matmul( + const Tensor& mat1, + const Tensor& mat2, + const Tensor& result_opt); + // Try running mkldnn optimized gemm, or returns false if naive gemm would be faster bool mkldnn_bf16_gemm( TransposeType transa, TransposeType transb, @@ -43,11 +48,31 @@ bool mkldnn_fp16_gemm( float beta, c10::Half *c, int64_t ldc); -bool use_mkldnn_lower_precision_matmul( +/* +oneDNN implicit reduced precision arithmetic feature +https://github.com/mgouicem/oneDNN/tree/mgouicem/rfcs/implicit_downconvert/rfcs/20210301-computation-datatype +to allow implicitly casting the data type from FP32 to BF16 in oneDNN compute primitives +*/ +bool mkldnn_bf32_gemm( + TransposeType transa, TransposeType transb, + int64_t m, int64_t n, int64_t k, + float alpha, + const float *a, int64_t lda, + const float *b, int64_t ldb, + float beta, + float *c, int64_t ldc); + +bool use_mkldnn_matmul( const Tensor& mat1, const Tensor& mat2, const Tensor& result); +// x:s8 * w:s8 -> y:s32 +TORCH_API void mkldnn_matmul_i8i8i32( + const Tensor &mat1, + const Tensor &mat2, + const Tensor &result); + } } diff --git a/aten/src/ATen/native/mkldnn/Normalization.cpp b/aten/src/ATen/native/mkldnn/Normalization.cpp index 108ce354ec9bb..0aced614a0ea3 100644 --- a/aten/src/ATen/native/mkldnn/Normalization.cpp +++ b/aten/src/ATen/native/mkldnn/Normalization.cpp @@ -6,6 +6,8 @@ #ifndef AT_PER_OPERATOR_HEADERS #include #else +#include +#include #include #include #include @@ -59,6 +61,20 @@ std::tuple _mkldnn_batch_norm_legit_no_stats( TORCH_CHECK(false, "_mkldnn_batch_norm_legit_no_stats: ATen not compiled with MKLDNN support"); } +std::tuple _batch_norm_with_update_mkldnn( + const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + Tensor& running_mean, Tensor& running_var, double momentum, double eps) { + TORCH_CHECK(false, "_batch_norm_with_update_mkldnn: ATen not compiled with MKLDNN support"); +} + +std::tuple _new_batch_norm_backward_mkldnn( + const Tensor& grad_output, const Tensor& input, const Tensor& weight, + const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + const c10::optional& save_mean_opt, const c10::optional& save_var_opt, + bool update, double eps, std::array grad_input_mask, const Tensor& reserve)
{ + TORCH_CHECK(false, "_new_batch_norm_backward_mkldnn: ATen not compiled with MKLDNN support"); +} + } // namespace native } // namespace at @@ -192,6 +208,17 @@ std::tuple mkldnn_batch_norm( } +std::tuple _batch_norm_with_update_mkldnn( + const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, + Tensor& running_mean, Tensor& running_var, double momentum, double eps) { + Tensor output, save_mean, save_var; + std::tie(output, save_mean, save_var) = + mkldnn_batch_norm(input, weight_opt, bias_opt, running_mean, running_var, /*train*/true, momentum, eps); + Tensor reserve = empty_mkldnn({0}, input.scalar_type()); + return std::tuple(output, save_mean, save_var, reserve); +} + + std::tuple _mkldnn_batch_norm_legit( const Tensor& input, const c10::optional& weight_opt, const c10::optional& bias_opt, Tensor& running_mean, Tensor& running_var, bool train, @@ -210,6 +237,15 @@ std::tuple _mkldnn_batch_norm_legit_no_stats( } +std::tuple _new_batch_norm_backward_mkldnn( + const Tensor& grad_output, const Tensor& input, const Tensor& weight, + const c10::optional& running_mean_opt, const c10::optional& running_var_opt, + const c10::optional& save_mean_opt, const c10::optional& save_var_opt, + bool update, double eps, std::array grad_input_mask, const Tensor& reserve) { + return mkldnn_batch_norm_backward(grad_output, input, weight, running_mean_opt, running_var_opt, save_mean_opt, save_var_opt, update, eps, grad_input_mask); +} + + std::tuple mkldnn_batch_norm_backward(const Tensor& grad_output, const Tensor& input, const c10::optional& weight_opt, const c10::optional& running_mean_opt, const c10::optional& running_var_opt, const c10::optional& save_mean_opt, const c10::optional& save_invstd_opt, bool train, diff --git a/aten/src/ATen/native/mkldnn/Pooling.cpp b/aten/src/ATen/native/mkldnn/Pooling.cpp index 5eb03c43220d0..7b59d7b85fe93 100644 --- a/aten/src/ATen/native/mkldnn/Pooling.cpp +++ b/aten/src/ATen/native/mkldnn/Pooling.cpp @@ -642,7 +642,7 @@ Tensor& mkldnn_avg_pool3d_backward_out(const Tensor & grad_output, Tensor mkldnn_adaptive_avg_pool2d_backward( const Tensor& grad_output, const Tensor& input) { - TORCH_CHECK(input.dim() == 4, "mkldnn_adaptive_avg_pool2d: Input is expected a 4D tenosor"); + TORCH_CHECK(input.dim() == 4, "mkldnn_adaptive_avg_pool2d: Input is expected a 4D tensor"); auto output_size_vec = grad_output.sizes(); std::vector kernel_size(input.dim() - 2); diff --git a/aten/src/ATen/native/mkldnn/RNN.cpp b/aten/src/ATen/native/mkldnn/RNN.cpp index a5effcc0ce158..afea7f91e79ea 100644 --- a/aten/src/ATen/native/mkldnn/RNN.cpp +++ b/aten/src/ATen/native/mkldnn/RNN.cpp @@ -75,7 +75,7 @@ REGISTER_NO_CPU_DISPATCH(lstm_mkldnn_stub); } // namespace at::native -#else // AT_MKLDNN_EBABLED +#else // AT_MKLDNN_ENABLED #include #include @@ -541,8 +541,7 @@ std::pair mkldnn_impl( const Tensor& input, const hidden_type& hidden, TensorList params, bool has_biases, ideep::rnn_kind mode, int64_t num_layers, double dropout_p, bool train, bool bidirectional, bool batch_first) { - Tensor hx, cx; - std::tie(hx, cx) = unpack_hidden(hidden); + auto [hx, cx] = unpack_hidden(hidden); int64_t hidden_size = hx.size(2); auto mkldnn_output = mkldnn_rnn( @@ -569,4 +568,4 @@ REGISTER_ALL_CPU_DISPATCH(lstm_mkldnn_stub, &lstm_mkldnn); } // namespace at::native -#endif // AT_MKLDNN_EBABLED +#endif // AT_MKLDNN_ENABLED diff --git a/aten/src/ATen/native/mkldnn/Utils.h b/aten/src/ATen/native/mkldnn/Utils.h index 6d4a172ebe400..aa804d6bc1877 100644 --- 
a/aten/src/ATen/native/mkldnn/Utils.h +++ b/aten/src/ATen/native/mkldnn/Utils.h @@ -97,7 +97,7 @@ constexpr bool mkldnn_bf16_device_check_arm() { #if AT_MKLDNN_ENABLED() inline bool mkldnn_bf16_device_check() { -#if defined(__x86_64__) +#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)) // Use ideep to check bf16 on X64 as cpuinfo has no avx_ne_convert check. return ideep::has_bf16_type_support(); #else @@ -106,7 +106,7 @@ inline bool mkldnn_bf16_device_check() { } inline bool mkldnn_fp16_device_check() { -#if defined(__x86_64__) +#if defined(__x86_64__) || (defined(_M_X64) && !defined(_M_ARM64EC)) return ideep::has_fp16_type_support(); #else return false; diff --git a/aten/src/ATen/native/mkldnn/xpu/Blas.cpp b/aten/src/ATen/native/mkldnn/xpu/Blas.cpp new file mode 100644 index 0000000000000..6cba3f4c9fa18 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/Blas.cpp @@ -0,0 +1,436 @@ +#include +#include +#include +#include + +namespace at::native::xpu { + +// result = beta * self + alpha * (mat1 * mat2) +Tensor& addmm_out( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const Scalar& beta, + const Scalar& alpha, + at::Tensor& result) { + checkBackend("addmm_out", {result, self, mat1, mat2}, Backend::XPU); + TORCH_CHECK( + mat1.dim() == 2, "mat1 must be a matrix, got ", mat1.dim(), "-D tensor"); + TORCH_CHECK( + mat2.dim() == 2, "mat2 must be a matrix, got ", mat2.dim(), "-D tensor"); + TORCH_CHECK( + mat1.sizes()[1] == mat2.sizes()[0], + "mat1 and mat2 shapes cannot be multiplied (", + mat1.sizes()[0], + "x", + mat1.sizes()[1], + " and ", + mat2.sizes()[0], + "x", + mat2.sizes()[1], + ")"); + + std::vector result_shape = {mat1.size(0), mat2.size(1)}; + result.resize_(result_shape); + + IntArrayRef result_sizes = result.sizes(); + if ((result_sizes[0] == 0) || (result_sizes[1] == 0)) { + return result; + } + + if (mat1.numel() == 0){ + if(beta.to() == 0.f){ + return result.zero_(); + } + return at::mul_out( + result, + self.expand(result.sizes()), + at::native::scalar_tensor( + beta, + self.scalar_type(), + c10::nullopt, + at::kCPU, + c10::nullopt + ) + ); + } + + TORCH_CHECK( + are_expandable(self.sizes(), result_shape), + "addmm_out input must be expanable to:", + result_shape, + " but got:", + self.sizes()); + + // complex/double case + if (mat1.is_complex() || mat1.scalar_type() == ScalarType::Double) { + AT_ERROR( + "Double and complex datatype matmul is not supported in oneDNN"); + } + + // general case + Tensor bias = Tensor(); + onednn::Attr attr; + float beta_ = beta.to(); + if (beta_ == 0.f) { + if (alpha.to() != 1.f) { + attr.append_post_eltwise( + 1.f, alpha.to(), 0.f, attr.kind_with_linear); + } + } else { + if (alpha.to() == 1.f && beta_ == 1.f) { + bias = self; + } else { + Tensor binary = self.dim() == 1 ? 
self.unsqueeze(0) : self; + // Tensor binary = self.expand_as(result); + // For post-binary-add, onednn needs binary scale=1.f + // Thus we need the following transformation + // alpha * matmul(mat1, mat2) + beta * binary + // beta * (alpha/beta * matmul(src, wei) + binary) + float alpha_ = alpha.to() / beta_; + if (alpha_ != 1.f) + attr.append_post_eltwise(1.f, alpha_, 0.f, attr.kind_with_linear); + attr.append_post_binary(attr.kind_with_binary_add, binary); + if (beta_ != 1.f) + attr.append_post_eltwise(1.f, beta_, 0.f, attr.kind_with_linear); + } + } + onednn::matmul(result, mat1, mat2, bias, true, attr); + return result; +} + +Tensor& _addmm_activation_out( + const Tensor& self, + const Tensor& mat1, + const Tensor& mat2, + const Scalar& beta, + const Scalar& alpha, + bool use_gelu, + at::Tensor& result) { + addmm_out(self, mat1, mat2, beta, alpha, result); + if (use_gelu) { + at::gelu_(result); + } else { + at::relu_(result); + } + return result; +} + +Tensor& mm_out(const Tensor& self, const Tensor& mat2, Tensor& result) { + checkBackend("mm_out", {result, self, mat2}, Backend::XPU); + TORCH_CHECK(self.dim() == 2, "self must be a matrix"); + TORCH_CHECK(mat2.dim() == 2, "mat2 must be a matrix"); + TORCH_CHECK( + self.sizes()[1] == mat2.sizes()[0], + "mat1 and mat2 shapes cannot be multiplied (", + self.sizes()[0], + "x", + self.sizes()[1], + " and ", + mat2.sizes()[0], + "x", + mat2.sizes()[1], + ")"); + + result.resize_({self.size(0), mat2.size(1)}); + if (self.numel() == 0 || mat2.numel() == 0) { + if (result.numel() > 0) + result.zero_(); + return result; + } + + if (self.is_complex() || self.scalar_type() == ScalarType::Double) { + AT_ERROR( + "Double and complex datatype matmul is not supported in oneDNN"); + } + + onednn::matmul(result, self, mat2, Tensor(), true, onednn::Attr()); + return result; +} + +Tensor mm(const Tensor& self, const Tensor& mat2) { + auto result = at::empty({0}, self.options()); + xpu::mm_out(self, mat2, result); + return result; +} + +Tensor mv(const Tensor& self, const Tensor& vec) { + Tensor result = at::empty({self.size(0)}, self.options()); + return at::addmv_(result, self, vec, 0, 1); +} + + +// result = beta * input + alpha * (batch1 @ batch2) +Tensor& baddbmm_out( + const Tensor& input, + const Tensor& batch1, + const Tensor& batch2, + const Scalar& beta, + const Scalar& alpha, + Tensor& result) { + checkBackend("baddbmm_out", {input, batch1, batch2}, Backend::XPU); + TORCH_CHECK(batch1.dim() == 3, "expected 3D tensor"); + TORCH_CHECK(batch2.dim() == 3, "expected 3D tensor"); + + std::vector result_shape = { + batch1.size(0), batch1.size(1), batch2.size(2)}; + result.resize_(result_shape); + if (result.numel() == 0){ + return result; + } else if (batch1.size(2) == 0){ + if (beta.to>() == 0.0){ + return result.zero_(); + }else{ + at::mul_out(result, input, beta); + return result; + } + } + + TORCH_CHECK( + are_expandable(input.sizes(), result_shape), + "baddbmm_out input must be expanable to:", + result_shape, + " but got:", + input.sizes()); + + // complex and double case + if (batch1.is_complex() || batch2.scalar_type() == ScalarType::Double) { + AT_ERROR( + "Double and complex datatype matmul is not supported in oneDNN"); + } + + // general case + onednn::Attr attr; + float beta_ = beta.to(); + Tensor binary; + if (beta_ == 0.f) { + if (alpha.to() != 1.f) { + attr.append_post_eltwise( + 1.f, alpha.to(), 0.f, attr.kind_with_linear); + } + } else { + binary = input.dim() < 3 ? input.unsqueeze(0) : input; + binary = binary.dim() < 3 ? 
binary.unsqueeze_(0) : binary; + float alpha_ = alpha.to() / beta_; + if (alpha_ != 1.f) + attr.append_post_eltwise(1.f, alpha_, 0.f, attr.kind_with_linear); + attr.append_post_binary(attr.kind_with_binary_add, binary); + if (beta_ != 1.f) + attr.append_post_eltwise(1.f, beta_, 0.f, attr.kind_with_linear); + } + onednn::matmul(result, batch1, batch2, at::Tensor(), true, attr); + return result; +} + +Tensor& baddbmm_( + Tensor& self, + const Tensor& batch1, + const Tensor& batch2, + const Scalar& beta, + const Scalar& alpha) { + TORCH_CHECK(self.dtype() == batch1.dtype(), "Input dtypes must be the same, got: input ", self.dtype(), ", batch1: ", batch1.dtype(), ", batch2: ", batch2.dtype()); + return at::native::xpu::baddbmm_out( + self, batch1, batch2, beta, alpha, self); +} + +Tensor baddbmm( + const Tensor& input, + const Tensor& batch1, + const Tensor& batch2, + const Scalar& beta, + const Scalar& alpha) { + Tensor r = at::empty({0}, input.options()); + TORCH_CHECK(input.dtype() == batch1.dtype(), "Input dtypes must be the same, got: input ", input.dtype(), ", batch1: ", batch1.dtype(), ", batch2: ", batch2.dtype()); + r = at::native::xpu::baddbmm_out(input, batch1, batch2, beta, alpha, r); + return r; +} + +Tensor& addbmm_out( + const Tensor& self, + const Tensor& batch1, + const Tensor& batch2, + const Scalar& beta, + const Scalar& alpha, + Tensor& out) { + checkBackend("addbmm_out", {out, self, batch1, batch2}, Backend::XPU); + TORCH_CHECK( + batch1.dim() == 3 && batch2.dim() == 3, + "Batch tensors should be 3D, got dimensions ", + batch1.dim(), + " and ", + batch2.dim()); + + out.resize_({batch1.size(1), batch2.size(2)}); + if (alpha.to() == 0.f || batch1.numel() == 0 || batch2.numel() == 0) { + out.resize_({batch1.size(1), batch2.size(2)}); + if (out.numel() == 0) + return out; + + if (self.defined() && beta.to() != 0.f) { + out = at::mul_out( + out, self, at::native::wrapped_scalar_tensor(at::Scalar(beta))); + } else { + out.zero_(); + } + return out; + } + + Tensor b1; + if (batch1.size(0) > 1) { + b1 = batch1.transpose(0, 1).contiguous().view({batch1.size(1), -1}); + } else { + b1 = batch1.contiguous().view({batch1.size(1), -1}); + } + auto b2 = batch2.contiguous().view({-1, batch2.size(2)}); + at::native::xpu::addmm_out(self, b1, b2, beta, alpha, out); + + return out; +} + +Tensor& addbmm_( + Tensor& self, + const Tensor& batch1, + const Tensor& batch2, + const Scalar& beta, + const Scalar& alpha) { + at::native::xpu::addbmm_out(self, batch1, batch2, beta, alpha, self); + return self; +} + +Tensor addbmm( + const Tensor& self, + const Tensor& batch1, + const Tensor& batch2, + const Scalar& beta, + const Scalar& alpha) { + Tensor out = at::empty({0}, self.options()); + at::native::xpu::addbmm_out(self, batch1, batch2, beta, alpha, out); + return out; +} + +Tensor& bmm_out(const Tensor& self, const Tensor& batch2, Tensor& result) { + checkBackend("bmm_out", {result, self, batch2}, Backend::XPU); + TORCH_CHECK(self.dim() == 3, "expected 3D tensor"); + TORCH_CHECK(batch2.dim() == 3, "expected 3D tensor"); + + result.resize_({self.size(0), self.size(1), batch2.size(2)}); + if (self.numel() == 0 || batch2.numel() == 0) { + if (result.numel() > 0) + result.zero_(); + return result; + } + + if (self.is_complex() || self.scalar_type() == ScalarType::Double) { + AT_ERROR( + "Double and complex datatype matmul is not supported in oneDNN"); + } + onednn::matmul(result, self, batch2, at::Tensor(), true, onednn::Attr()); + return result; +} + +Tensor bmm(const Tensor& self, const Tensor& 
batch2) { + auto result = at::empty({0}, self.options()); + at::native::xpu::bmm_out(self, batch2, result); + return result; +} + +Tensor& addmv_out( + const Tensor& self, + const Tensor& mat, + const Tensor& vec, + const Scalar& beta, + const Scalar& alpha, + Tensor& out) { + Tensor self_v; + TORCH_CHECK( + (mat.dim() == 2 && vec.dim() == 1 && self.dim() <= 1), + "vector + matrix @ vector expected, got ", + self.dim(), + ", ", + mat.dim(), + ", ", + vec.dim()); + if (self.dim() == 1 && self.size(0) != 1) { + TORCH_CHECK( + (mat.size(1) == vec.size(0) && mat.size(0) == self.size(0)), + "size mismatch, get ", + self.size(0), + ", ", + mat.size(0), + "x", + mat.size(1), + ",", + vec.size(0)); + self_v = self.view({self.size(0), 1}); + } else { + TORCH_CHECK( + (mat.size(1) == vec.size(0)), + "size mismatch, get ", + mat.size(0), + "x", + mat.size(1), + ",", + vec.size(0)); + self_v = self; + } + + Tensor vec_v = vec.view({vec.size(0), 1}); + at::native::xpu::addmm_out(self_v, mat, vec_v, beta, alpha, out); + out.resize_({mat.size(0)}); + return out; +} + +Tensor& tensordot_out( + const Tensor& input1, + const Tensor& input2, + IntArrayRef dims1, + IntArrayRef dims2, + Tensor& result) { + Tensor result_tmp = at::tensordot(input1, input2, dims1, dims2); + auto result_dtype = result_tmp.scalar_type(); + auto output_tensor_dtype = result.scalar_type(); + auto output_device = result.device(); + auto input1_device = input1.device(); + auto input2_device = input2.device(); + // check if the input & output tensors are on the same device. + TORCH_CHECK( + (output_device == input1_device) && (input1_device == input2_device), + "tensordot: Expected the output and input tensors to be on the " + "same device, but got the output tensor on ", + output_device, + ", input tensor a on ", + input1_device, + ", and input tensor b on ", + input2_device); + // check if the computed result has the same dtype as the out tensor + // (because tensordot does not support type promotion) + TORCH_CHECK( + result_dtype == output_tensor_dtype, + "tensordot", + ": Expected the output tensor to have dtype ", + result_dtype, + ", but got an output tensor with dtype ", + output_tensor_dtype); + at::native::resize_output(result, result_tmp.sizes()); + result.copy_(result_tmp); + return result; +} + +TORCH_LIBRARY_IMPL(aten, XPU, m){ + m.impl("addmm.out", TORCH_FN(addmm_out)); + m.impl("_addmm_activation.out", TORCH_FN(_addmm_activation_out)); + m.impl("mm.out", TORCH_FN(mm_out)); + m.impl("mm", TORCH_FN(mm)); + m.impl("baddbmm.out", TORCH_FN(baddbmm_out)); + m.impl("baddbmm_", TORCH_FN(baddbmm_)); + m.impl("baddbmm", TORCH_FN(baddbmm)); + m.impl("addbmm.out", TORCH_FN(addbmm_out)); + m.impl("addbmm_", TORCH_FN(addbmm_)); + m.impl("addbmm", TORCH_FN(addbmm)); + m.impl("bmm.out", TORCH_FN(bmm_out)); + m.impl("bmm", TORCH_FN(bmm)); + m.impl("addmv.out", TORCH_FN(addmv_out)); + m.impl("tensordot.out", TORCH_FN(tensordot_out)); +} + +} // namespace at::native::xpu diff --git a/aten/src/ATen/native/mkldnn/xpu/Conv.cpp b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp new file mode 100644 index 0000000000000..8ac19605b1c79 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/Conv.cpp @@ -0,0 +1,739 @@ +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace dnnl; +using namespace at::native; +using namespace at::native::onednn; + +namespace at::native { +namespace xpu { +namespace impl { + +struct ConvParams { + std::vector stride; + std::vector padding; + 
std::vector dilation; + bool transposed; + std::vector output_padding; + int groups; + bool benchmark; + bool deterministic; + + bool is_strided() const; + bool is_dilated() const; + bool is_padded() const; + bool is_output_padding_neg() const; + bool is_output_padding_big() const; + bool is_padding_neg() const; + bool is_stride_nonpos() const; + void view1d_as_2d(); + bool use_cpu_depthwise3x3_winograd( + const at::Tensor& input, + const at::Tensor& weight) const; + bool is_depthwise(const at::Tensor& input, const at::Tensor& weight) const; +}; + +std::ostream& operator<<(std::ostream& out, const ConvParams& params) { + out << "ConvParams {" + << " stride = " << IntArrayRef{params.stride} + << " padding = " << IntArrayRef{params.padding} + << " dilation = " << IntArrayRef{params.dilation} + << " transposed = " << params.transposed + << " output_padding = " << IntArrayRef{params.output_padding} + << " groups = " << params.groups << " benchmark = " << params.benchmark + << " deterministic = " << params.deterministic << "}"; + return out; +} + +bool ConvParams::is_strided() const { + bool is_strided = false; + for (int s : stride) { + is_strided |= (s != 1); + } + return is_strided; +} + +bool ConvParams::is_dilated() const { + bool is_dilated = false; + for (int d : dilation) { + is_dilated |= (d != 1); + } + return is_dilated; +} + +bool ConvParams::is_padded() const { + bool is_padded = false; + for (int p : padding) { + is_padded |= (p != 0); + } + return is_padded; +} + +bool ConvParams::is_output_padding_neg() const { + bool is_non_neg = false; + for (int p : output_padding) { + is_non_neg |= (p < 0); + } + return is_non_neg; +} + +bool ConvParams::is_output_padding_big() const { + bool is_big = false; + for (size_t i = 0; i < output_padding.size(); i++) { + is_big |= + (output_padding[i] >= stride[i] || output_padding[i] >= dilation[i]); + } + return is_big; +} + +bool ConvParams::is_padding_neg() const { + bool is_non_neg = false; + for (int p : padding) { + is_non_neg |= (p < 0); + } + return is_non_neg; +} + +bool ConvParams::is_stride_nonpos() const { + bool is_nonpos = false; + for (int s : stride) { + is_nonpos |= (s <= 0); + } + return is_nonpos; +} + +void ConvParams::view1d_as_2d() { + if (stride.size() == 1) { + stride.insert(stride.begin(), 1); + padding.insert(padding.begin(), 0); + dilation.insert(dilation.begin(), 1); + output_padding.insert(output_padding.begin(), 0); + } +} + +bool ConvParams::use_cpu_depthwise3x3_winograd( + const at::Tensor& input, + const at::Tensor& weight) const { + return false; +} + +bool ConvParams::is_depthwise(const at::Tensor& input, const at::Tensor& weight) + const { + return !transposed && input.ndimension() == 4 && input.size(1) == groups && + groups > 1 && // no point if there is only a single group + weight.size(0) % input.size(1) == + 0; // output channels must be a multiple of input channels +} + +static void check_shape_forward( + const at::Tensor& input, + const at::Tensor& weight, + const at::Tensor& bias, + const ConvParams& params, + bool input_is_mkldnn) { + int64_t k = input.ndimension(); + int64_t weight_dim = weight.ndimension(); + std::vector weight_sizes(weight_dim); + if ((weight_dim == k + 1) && input_is_mkldnn) { + weight_sizes[0] = weight.size(0) * weight.size(1); + std::copy_n(weight.sizes().cbegin() + 2, k - 1, weight_sizes.begin() + 1); + weight_dim = k; + } else { + std::copy_n(weight.sizes().cbegin(), weight_dim, weight_sizes.begin()); + } + int64_t groups = params.groups; + auto padding = params.padding; + auto 
output_padding = params.output_padding; + auto stride = params.stride; + auto dilation = params.dilation; + bool transposed = params.transposed; + + TORCH_CHECK(!params.is_padding_neg(), "negative padding is not supported"); + TORCH_CHECK( + !params.is_output_padding_neg(), + "negative output_padding is not supported"); + TORCH_CHECK( + !params.is_stride_nonpos(), "non-positive stride is not supported"); + + TORCH_CHECK( + weight_dim == k, + "Expected ", + weight_dim, + "-dimensional input for ", + weight_dim, + "-dimensional weight ", + weight_sizes, + ", but got ", + k, + "-dimensional input of size ", + input.sizes(), + " instead"); + TORCH_CHECK( + weight_sizes[0] >= groups, + "Given groups=", + groups, + ", expected weight to be at least ", + groups, + " at dimension 0, but got weight of size ", + weight_sizes, + " instead"); + TORCH_CHECK( + weight_sizes[0] % groups == 0, + "Given groups=", + groups, + ", expected weight to be divisible by ", + groups, + " at dimension 0, but got weight of size ", + weight_sizes, + " instead"); + + if (!transposed) { + std::vector input_shape; + std::vector kernel_shape; + bool kernel_size_correct = true; + + TORCH_CHECK( + input.size(1) == (weight_sizes[1] * groups), + "Given groups=", + groups, + ", weight of size ", + weight_sizes, + ", expected input", + input.sizes(), + " to have ", + (weight_sizes[1] * groups), + " channels, but got ", + input.size(1), + " channels instead"); + TORCH_CHECK( + !bias.defined() || + (bias.ndimension() == 1 && bias.size(0) == weight_sizes[0]), + "Given weight of size ", + weight_sizes, + ", expected bias to be 1-dimensional with ", + weight_sizes[0], + " elements", + ", but got bias of size ", + bias.sizes(), + " instead"); + + for (int i = 2; i < k; ++i) { + input_shape.push_back(input.size(i) + 2 * padding[i - 2]); + kernel_shape.push_back(dilation[i - 2] * (weight_sizes[i] - 1) + 1); + if (input_shape.back() < kernel_shape.back()) { + kernel_size_correct = false; + } + } + + TORCH_CHECK( + input_shape.size() == kernel_shape.size(), + "Inconsistent shape between Input and Kernel"); + + if (!kernel_size_correct) { + std::ostringstream input_ss; + std::ostringstream kernel_ss; + std::ostringstream output_ss; + std::string separator = ""; + + for (int i = 0, len = input_shape.size(); i < len; ++i) { + input_ss << separator << input_shape[i]; + kernel_ss << separator << kernel_shape[i]; + separator = " x "; + } + + TORCH_CHECK( + 0, + "Calculated padded input size per channel: (", + input_ss.str(), + "). " + "Kernel size: (", + kernel_ss.str(), + "). 
Kernel size can't be greater than actual input size"); + } + } else { + TORCH_CHECK( + input.size(1) == weight_sizes[0], + "Given transposed=", + transposed, + ", weight of size ", + weight_sizes, + ", expected input", + input.sizes(), + " to have ", + weight_sizes[0], + " channels, but got ", + input.size(1), + " channels instead"); + TORCH_CHECK( + !bias.defined() || + (bias.ndimension() == 1 && + bias.size(0) == weight_sizes[1] * groups), + "Given transposed=", + transposed, + ", weight of size ", + weight_sizes, + ", expected bias to be 1-dimensional with ", + weight_sizes[1] * groups, + " elements", + ", but got bias of size ", + bias.sizes(), + " instead"); + } +} + +static at::Tensor view4d(const at::Tensor& tensor) { + TORCH_CHECK( + tensor.ndimension() == 3, + "expected 3D tensor, got tensor with ", + tensor.ndimension(), + " dimensions instead"); + return tensor.unsqueeze(2); +} + +static at::Tensor view3d(const at::Tensor& tensor) { + TORCH_CHECK( + tensor.ndimension() == 4, + "expected 4D tensor, got tensor with ", + tensor.ndimension(), + " dimensions instead"); + return tensor.squeeze(2); +} + +Attr get_onednn_conv_sum_attr( + const Tensor& input_r, + const Tensor& weight_r, + IntArrayRef stride_, + IntArrayRef padding_, + IntArrayRef dilation_, + Tensor& accumu, + double scale, + Tensor& output, + bool& is_fused, + Attr attr = Attr(), + bool force_inplace = false) { + is_fused = true; + if (scale == 0.f) + return attr; + + auto ndim = input_r.ndimension(); + auto output_size = conv_dst_size( + ndim, + input_r.sizes(), + weight_r.sizes(), + padding_, + padding_, + stride_, + dilation_); + MemoryFormat mem_fmt = at::MemoryFormat::Contiguous; + auto input_fmt = input_r.suggest_memory_format(); + auto input_is_cl = (input_fmt == at::MemoryFormat::ChannelsLast || input_fmt == at::MemoryFormat::ChannelsLast3d); + auto weight_fmt = weight_r.suggest_memory_format(); + auto weight_is_cl = (weight_fmt == at::MemoryFormat::ChannelsLast || weight_fmt == at::MemoryFormat::ChannelsLast3d); + + bool propagate_channels_last = input_is_cl || weight_is_cl; + if (propagate_channels_last) + mem_fmt = get_cl_tag_by_ndim(ndim); + + Tensor out = at::empty(output_size, input_r.options().memory_format(mem_fmt)); + if (!onednn::binary_valid(out, accumu)) { + is_fused = false; + return attr; + } + + // For post-sum and post-binary-add, onednn needs sum/binary scale=1.f + // Thus we need the following transformation + // conv(src, wei) + scale * accumu + // scale * (1/scale * conv(src, wei) + sum (or binary)) + if (scale != 1.f) + attr.append_post_eltwise( + /* scale */ 1.f, + /* alpha */ 1.f / scale, + /* beta */ 0.f, + attr.kind_with_linear); + + if (force_inplace) { + // If sizes are the same, post sum is used. + output = accumu; + attr.append_post_sum(/* sum_scale */ 1.f); + } else { + // If sizes are different, post binary is used. 
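// Worked example of the folding described above (illustrative comment, not
// part of the patch): to fuse  y = conv(x, w) + 2.0f * accumu  while keeping
// the post-sum/binary scale at 1.f, rewrite it as
//   y = 2.0f * (0.5f * conv(x, w) + accumu)
// i.e. eltwise linear with alpha = 1/scale, then post-sum or binary_add with
// accumu, then eltwise linear with alpha = scale; both linear steps are
// skipped when scale == 1.f.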
+ attr.append_post_binary(attr.kind_with_binary_add, accumu); + } + + if (scale != 1.f) + attr.append_post_eltwise( + /* scale */ 1.f, + /* alpha */ scale, + /* beta */ 0.f, + attr.kind_with_linear); + + return attr; +} + +} // namespace impl + +using namespace impl; + +Tensor _convolution_out( + Tensor& output_r, + const Tensor& input_r, + const Tensor& weight_r, + const Tensor& bias_r, + IntArrayRef stride_, + IntArrayRef padding_, + IntArrayRef dilation_, + bool transposed_, + IntArrayRef output_padding_, + int64_t groups_, + Attr attr, + IntArrayRef pad_nd = IntArrayRef({})) { + auto ndim = input_r.ndimension(); + TORCH_CHECK( + 3 == ndim || 4 == ndim || 5 == ndim, + "convolution only supports 3D, 4D, 5D tensor"); + // get computation format for Conv/TransposedConv + bool is_channels_last_suggested = use_channels_last_for_conv(input_r, weight_r, transposed_); + + Tensor input = input_r, weight = weight_r; + // PyTorch does not support ChannelsLast1D case, + // thus we need the transformation here + if (ndim == 3) { + input = view4d(input_r); + weight = view4d(weight_r); + } + // ensure the input/weight/bias/output are congituous in desired format + at::MemoryFormat mfmt = is_channels_last_suggested + ? get_cl_tag_by_ndim(input.ndimension()) + : at::MemoryFormat::Contiguous; + auto bias = bias_r.defined() ? bias_r.contiguous() : bias_r; + input = input.contiguous(mfmt); + weight = weight.contiguous(mfmt); + + auto k = weight.ndimension(); + if (k == input.ndimension() + 1) { + k = input.ndimension(); + } + int64_t dim = k - 2; + TORCH_CHECK(dim > 0, "weight should have at least three dimensions"); + + ConvParams params; + if (ndim == 3) { + // PyTorch does not support ChannelsLast1D case, + // thus we need the transformation here + params.stride = stride_.vec(); + params.padding = padding_.vec(); + params.dilation = dilation_.vec(); + params.transposed = transposed_; + params.output_padding = output_padding_.vec(); + params.groups = groups_; + params.view1d_as_2d(); + } else { + params.stride = expand_param_if_needed(stride_, "stride", dim); + // PyTorch default Conv padding should be a single integer value + // or a list of values to match the conv dimensions + // conv2d, the number of padding values should be 1 or 2 + // conv3d, the number of padding values should be 1 or 3 + // the padding value will be padded into both side of Conv input (D, H, W) + params.padding = expand_param_if_needed(padding_, "padding", dim); + params.dilation = expand_param_if_needed(dilation_, "dilation", dim); + params.transposed = transposed_; + params.output_padding = + expand_param_if_needed(output_padding_, "output_padding", dim); + params.groups = groups_; + } + check_shape_forward(input, weight, bias, params, true); + + Tensor output; + if (transposed_) { + // create output and propagate memory format + if (!output_r.defined()) { + auto dst_tz = deconv_dst_size( + input.sizes(), + weight.sizes(), + params.padding, + params.stride, + params.dilation, + params.output_padding, + params.groups); + output = at::empty(dst_tz, input.options(), mfmt); + } + + onednn::deconvolution( + output, + input, + weight, + bias, + params.stride, + params.padding, + params.output_padding, + params.dilation, + params.groups, + attr); + } else { + // oneDNN supports padding the two sides of src with different values + // the padding order should be front_top_left and back_bottom_right + auto padding_front_top_left = params.padding; + auto padding_back_bottom_right = params.padding; + + // PyTorch constant_pad_nd: + // can 
pad different value to the two sides of Conv input (W, H, D) + // (padding_left, padding_right, + // padding_top, padding_bottom, + // padding_front, padding_back) + if (pad_nd.vec().size() > 0) { + for (int i = 0; i < dim; ++i) { + padding_front_top_left[i] += pad_nd[2 * dim - 2 * i - 2]; // 4, 2, 0 + padding_back_bottom_right[i] += pad_nd[2 * dim - 2 * i - 1]; // 5, 3, 1 + } + } + + // create output and propagate memory format + if (! output_r.defined()) { + auto dst_tz = conv_dst_size( + input.ndimension(), + input.sizes(), + weight.sizes(), + padding_front_top_left, + padding_back_bottom_right, + params.stride, + params.dilation); + output = at::empty(dst_tz, input.options(), mfmt); + } + onednn::convolution( + output, + input, + weight, + bias, + padding_front_top_left, + padding_back_bottom_right, + params.stride, + params.dilation, + params.groups, + attr); + } + + if (ndim == 3) { + output = view3d(output); + } + if (output_r.defined() && !output_r.is_same(output)) { + output_r.copy_(output); + } else { + output_r = output; + } + return output_r; +} + +Tensor _convolution( + const Tensor& input_r, + const Tensor& weight_r, + const Tensor& bias_r, + IntArrayRef stride_, + IntArrayRef padding_, + IntArrayRef dilation_, + bool transposed_, + IntArrayRef output_padding_, + int64_t groups_, + Attr attr) { + Tensor output_r; + return _convolution_out( + output_r, + input_r, + weight_r, + bias_r, + stride_, + padding_, + dilation_, + transposed_, + output_padding_, + groups_, + attr); +} + +Tensor convolution_overrideable( + const Tensor& input_r, + const Tensor& weight_r, + const c10::optional& bias_r_opt, + IntArrayRef stride_, + IntArrayRef padding_, + IntArrayRef dilation_, + bool transposed_, + IntArrayRef output_padding_, + int64_t groups_) { + c10::MaybeOwned bias_r_maybe_owned = + at::borrow_from_optional_tensor(bias_r_opt); + const Tensor& bias_r = *bias_r_maybe_owned; + + auto k = weight_r.ndimension(); + at::MemoryFormat backend_memory_format = at::MemoryFormat::Contiguous; + if (xpu_conv_use_channels_last(input_r, weight_r)) { + backend_memory_format = (k == 5) ? 
at::MemoryFormat::ChannelsLast3d : at::MemoryFormat::ChannelsLast; + } + Tensor input_c = input_r.contiguous(backend_memory_format); + Tensor weight_c = weight_r.contiguous(backend_memory_format); + + return _convolution( + input_c, + weight_c, + bias_r, + stride_, + padding_, + dilation_, + transposed_, + output_padding_, + groups_, + Attr()); +} + +std::tuple convolution_backward_overrideable( + const Tensor& grad_output, + const Tensor& input, + const Tensor& weight, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + bool transposed, + IntArrayRef output_padding, + int64_t groups, + std::array output_mask) { + auto ndim = input.ndimension(); + TORCH_CHECK( + 3 == ndim || 4 == ndim || 5 == ndim, + "convolution bwd only supports 3D, 4D, 5D tensor"); + TORCH_CHECK( + grad_output.scalar_type() == ScalarType::Float || + grad_output.scalar_type() == ScalarType::BFloat16 || + grad_output.scalar_type() == ScalarType::Double || + grad_output.scalar_type() == ScalarType::Half, + "so far only support float, bfloat16, half and double convolution backward in XPU backend, your data type is ", + grad_output.scalar_type()); + + bool is_channels_last_suggested = use_channels_last_for_conv(input, weight, transposed); + + Tensor grad_output_, input_, weight_; + IntArrayRef stride_, padding_, dilation_, output_padding_; + bool transposed_; + int64_t groups_; + ConvParams params; + if (3 == ndim) { + grad_output_ = view4d(grad_output); + input_ = view4d(input); + weight_ = view4d(weight); + params.stride = stride.vec(); + params.padding = padding.vec(); + params.dilation = dilation.vec(); + params.transposed = transposed; + params.output_padding = output_padding.vec(); + params.groups = groups; + params.view1d_as_2d(); + stride_ = params.stride; + padding_ = params.padding; + dilation_ = params.dilation; + transposed_ = params.transposed; + output_padding_ = params.output_padding; + groups_ = params.groups; + } else { + grad_output_ = grad_output; + input_ = input; + weight_ = weight; + stride_ = stride; + padding_ = padding; + dilation_ = dilation; + transposed_ = transposed; + output_padding_ = output_padding; + groups_ = groups; + } + + // ensure the tensors are contiguous + auto mfmt = is_channels_last_suggested ? 
get_cl_tag_by_ndim(input_.ndimension()) + : at::MemoryFormat::Contiguous; + grad_output_ = grad_output_.contiguous(mfmt); + weight_ = weight_.contiguous(mfmt); + input_ = input_.contiguous(mfmt); + + auto opt = grad_output_.options(); + Tensor grad_input = at::empty(input_.sizes(), opt, mfmt); + Tensor grad_weight = at::empty(weight_.sizes(), opt, mfmt); + Tensor grad_bias; + if (output_mask[2]) + grad_bias = at::empty({grad_output_.size(1)}, opt); + + if (output_mask[0]) { + if (input.numel() > 0) { + if (transposed_) { + onednn::deconvolution_backward_data( + grad_input, + grad_output_, + weight_, + stride_, + padding_, + dilation_, + groups_, + output_mask[2]); + } else { + onednn::convolution_backward_data( + grad_input, + grad_output_, + weight_, + padding_, + padding_, + stride_, + dilation_, + groups_, + output_mask[2]); + } + } + } + if (output_mask[1] || output_mask[2]) { + if (input.numel() > 0) { + if (transposed_) { + onednn::deconvolution_backward_weights( + grad_weight, + grad_bias, + grad_output_, + input_, + stride_, + padding_, + dilation_, + groups_); + } else { + onednn::convolution_backward_weights( + grad_weight, + grad_bias, + grad_output_, + input_, + weight_.sizes(), + padding_, + padding_, + stride_, + dilation_, + groups_); + } + } + } + + if (3 == ndim) { + if (output_mask[0]) + grad_input = view3d(grad_input); + grad_weight = view3d(grad_weight); + } + return std::tuple{grad_input, grad_weight, grad_bias}; +} + +TORCH_LIBRARY_IMPL(aten, XPU, m){ + m.impl("convolution_overrideable", TORCH_FN(convolution_overrideable)); + m.impl("convolution_backward_overrideable", TORCH_FN(convolution_backward_overrideable)); +} + +} // namespace xpu +} // namespace at::native diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Attr.h b/aten/src/ATen/native/mkldnn/xpu/detail/Attr.h new file mode 100644 index 0000000000000..56e587084959d --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Attr.h @@ -0,0 +1,365 @@ +#pragma once + +#include +#include +#include +#include +#include + +namespace at::native::onednn { +/* oneDNN quantization usage: + https://oneapi-src.github.io/oneDNN/dev_guide_attributes_quantization.html# + + src_fp32 = scale_src * (src_int8 - zero_point) + wei_fp32 = scale_wei * (wei_int8 - zero_point) + dst_fp32 = scale_dst * (dst_int8 - zero_point) + fp32 Convolution: dst_fp32 = src_fp32 * wei_fp32 + Int8 Convolution: dst_fp32 = (src_int8 * wei_int8) * (scale_src * scale_wei) + Int8 Convolution: dst_int8 = 1 / scale_dst * dst_fp32; + + Considering zero-point (asymmetric): + dst_fp32 = (src_int8 - src_zp) * src_sc * wei_int8 * wei_sc + dst_sc * (dst_int8 - dst_zp) = (src_int8 - src_zp) * wei_int8 * src_sc * + wei_sc + dst_int8 = (src_int8 - src_zp) * wei_int8 * src_sc * wei_sc / dst_sc + + dst_zp + + considering bias: + fp32 Convolution: dst_fp32 = src_fp32 * wei_fp32 + bias + Int8 Convolution: dst_fp32 = (src_int8 * wei_int8) * (scale_src * scale_wei) + + bias Int8 Convolution: dst_fp32 = (src_int8 * wei_int8 + bias/(scale_src * + scale_wei)) * (scale_src * scale_wei) Int8 Convolution: dst_int8 = 1 / + scale_dst * dst_fp32; +*/ + +/* + oneDNN postops usage: + Currently, oneDNN supports 5 kinds of post ops. More details can be refered +to oneDNN doc. + https://oneapi-src.github.io/oneDNN/dev_guide_attributes_post_ops.html#doxid-dev-guide-attributes-post-ops-1dev-guide-attributes-post-ops-eltwise + +0. 
without post ops + dst = Conv(src, wei) + bias; + dst_int8 = 1/q_scale * dst; q_scale is the op output quantization scale + fp32 API: Attr attr; + int8 API: Attr attr(q_scale); + +1. append eltwise post op + dst = elt_scale * Eltwise{conv_scale * [Conv(src, wei) + bias], alpha, beta} + dst_int8 = 1/q_scale * dst; + fp32 API: + Attr attr; + attr.append_post_eltwise(1.f, conv_scale, 0.f, kind_with_linear) + attr.append_post_eltwise(elt_scale, alpha, beta, eltwise_algorithm) + int8 API: + Attr attr(q_scale); + attr.append_post_eltwise(1.f, conv_scale, 0.f, kind_with_linear) + attr.append_post_eltwise(elt_scale, alpha, beta, eltwise_algorithm) + +2. append sum post op + dst = conv_scale * Conv(src, wei) + sum_scale * (dst - zp) + dst_int8 = 1/q_scale * dst; + fp32 API: + Attr attr; + attr.append_post_eltwise(1.f, conv_scale, 0.f, kind_with_linear) + attr.append_post_sum(sum_scale) + int8 API: + Attr attr(q_scale); + attr.append_post_eltwise(1.f, conv_scale, 0.f, kind_with_linear) + attr.append_post_sum(sum_scale) + +3. append binary post op + dst = Binary[Conv(src, wei)] + +*/ +using kind_t = dnnl::primitive::kind; +struct PostOpParam { + // eltwise post op constructor + PostOpParam(float scale, float alpha, float beta, dnnl::algorithm algo, kind_t kind) + : scale_(scale), alpha_(alpha), beta_(beta), algo_(algo), kind_(kind) {} + // sum post op constructor + PostOpParam(float scale, kind_t kind) : scale_(scale), kind_(kind) {} + // binary post op constructor + PostOpParam( + at::Tensor& binary, + dnnl::memory::desc& binary_md, + dnnl::memory::desc& expected_md, + dnnl::algorithm algo, + kind_t kind) + : binary_(binary), + meta_(binary_md), + expected_meta_(expected_md), + algo_(algo), + kind_(kind) {} + // prelu post op constructor + PostOpParam(int mask, kind_t kind) : mask_(mask), kind_(kind) {} + + // post sum or binary with scale post op constructor + PostOpParam(at::Tensor& binary, float scale, dnnl::algorithm algo, kind_t kind) + : scale_(scale), binary_(binary), algo_(algo), kind_(kind) {} + + // for int8 sum/eltwise + float scale_ = 1.0; + // for eltwise + float alpha_ = 0.0; + float beta_ = 0.0; + // for binary + at::Tensor binary_ = at::Tensor(); + at::Tensor expected_binary_ = at::Tensor(); + void* binary_ptr_ = nullptr; + dnnl::memory::desc meta_ = dnnl::memory::desc(); + dnnl::memory::desc expected_meta_ = dnnl::memory::desc(); + // for prelu + int mask_ = 0; + // common + dnnl::algorithm algo_ = dnnl::algorithm::eltwise_relu; + kind_t kind_ = kind_t::eltwise; +}; + +class Attr { + public: + Attr() : q_scale_(1.f), q_zero_point_(0) {} + Attr(float q_scale, int64_t zp = 0) : q_scale_(q_scale), q_zero_point_(zp) {} + + /***** eltwise *****/ + dnnl::algorithm kind_with_relu = dnnl::algorithm::eltwise_relu; + dnnl::algorithm kind_with_sigmoid = dnnl::algorithm::eltwise_logistic; + dnnl::algorithm kind_with_gelu_tanh = dnnl::algorithm::eltwise_gelu_tanh; + dnnl::algorithm kind_with_gelu_erf = dnnl::algorithm::eltwise_gelu_erf; + dnnl::algorithm kind_with_mish = dnnl::algorithm::eltwise_mish; + dnnl::algorithm kind_with_linear = dnnl::algorithm::eltwise_linear; + dnnl::algorithm kind_with_swish = dnnl::algorithm::eltwise_swish; + dnnl::algorithm kind_with_sqrt = dnnl::algorithm::eltwise_sqrt; + dnnl::algorithm kind_with_tanh = dnnl::algorithm::eltwise_tanh; + dnnl::algorithm kind_with_square = dnnl::algorithm::eltwise_square; + dnnl::algorithm kind_with_abs = dnnl::algorithm::eltwise_abs; + dnnl::algorithm kind_with_exp = dnnl::algorithm::eltwise_exp; + dnnl::algorithm kind_with_log = 
dnnl::algorithm::eltwise_log; + dnnl::algorithm kind_with_round = dnnl::algorithm::eltwise_round; + dnnl::algorithm kind_with_hardswish = dnnl::algorithm::eltwise_hardswish; + dnnl::algorithm kind_with_soft_relu = dnnl::algorithm::eltwise_soft_relu; + dnnl::algorithm kind_with_elu = dnnl::algorithm::eltwise_elu; + dnnl::algorithm kind_with_pow = dnnl::algorithm::eltwise_pow; + dnnl::algorithm kind_with_clip = dnnl::algorithm::eltwise_clip; + // note: hardsigmoid seems oneDNN still not support + dnnl::algorithm kind_with_hardsigmoid = dnnl::algorithm::eltwise_hardsigmoid; + + /***** binary *****/ + dnnl::algorithm kind_with_binary_mul = dnnl::algorithm::binary_mul; + dnnl::algorithm kind_with_binary_add = dnnl::algorithm::binary_add; + dnnl::algorithm kind_with_binary_sub = dnnl::algorithm::binary_sub; + dnnl::algorithm kind_with_binary_div = dnnl::algorithm::binary_div; + dnnl::algorithm kind_with_binary_eq = dnnl::algorithm::binary_eq; + dnnl::algorithm kind_with_binary_ne = dnnl::algorithm::binary_ne; + dnnl::algorithm kind_with_binary_ge = dnnl::algorithm::binary_ge; + dnnl::algorithm kind_with_binary_gt = dnnl::algorithm::binary_gt; + dnnl::algorithm kind_with_binary_le = dnnl::algorithm::binary_le; + dnnl::algorithm kind_with_binary_lt = dnnl::algorithm::binary_lt; + dnnl::algorithm kind_with_binary_max = dnnl::algorithm::binary_max; + dnnl::algorithm kind_with_binary_min = dnnl::algorithm::binary_min; + + // append sum post op + Attr& append_post_sum( + float sum_scale, + float sum_q_scale = 1.f, + int64_t zp = 0) { + ops_params_.push_back( + PostOpParam(/*scale_sum*/ sum_scale * sum_q_scale, kind_t::sum)); + return *this; + } + + // append eltwise post op + Attr& append_post_eltwise( + float scale, + float alpha, + float beta, + dnnl::algorithm algo) { + ops_params_.push_back( + PostOpParam(scale, alpha, beta, algo, kind_t::eltwise)); + return *this; + } + + // append binary post op + Attr& append_post_binary(dnnl::algorithm algo, const at::Tensor& binary) { + auto binary_ = binary.is_quantized() ? at::dequantize(binary) : binary; + bool binary_is_channels_last = (binary_.suggest_memory_format() == at::MemoryFormat::ChannelsLast || + binary_.suggest_memory_format() == at::MemoryFormat::ChannelsLast3d); + + binary_ = binary_is_channels_last ? 
binary_ : binary_.contiguous(); + dnnl::memory::desc md = get_onednn_md(binary_); + auto expected_md = dnnl::memory::desc( + md.get_dims(), md.get_data_type(), dnnl::memory::format_tag::any); + ops_params_.push_back( + PostOpParam(binary_, md, expected_md, algo, kind_t::binary)); + return *this; + } + + Attr& append_scale_binary( + dnnl::algorithm algo, + at::Tensor binary, + float scale, + float sum_q_scale = 1.f, + int64_t zp = 0) { + ops_params_.push_back(PostOpParam( + binary, /*scale_sum*/ scale * sum_q_scale, algo, kind_t::binary)); + return *this; + } + + // append bias with binary_add method (only used for QConv now) + template + Attr& append_bias(const at::Tensor& binary) { + // In PyTorch, bias are in shape of [OC], + // we expand its shape according to Conv dimension + // Conv1d [OC, 1, 1], Conv2d [1, OC, 1, ,1], Conv3d [1, OC, 1, 1, 1] + at::Tensor binary_ = binary.contiguous(); + dnnl::memory::desc binary_md; + switch (N) { + case 1: + binary_md = dnnl::memory::desc( + {binary.size(0), 1, 1}, + dnnl::memory::data_type::f32, + dnnl::memory::format_tag::abc); + break; + case 2: + binary_md = dnnl::memory::desc( + {1, binary.size(0), 1, 1}, + dnnl::memory::data_type::f32, + dnnl::memory::format_tag::abcd); + break; + case 3: + binary_md = dnnl::memory::desc( + {1, binary.size(0), 1, 1, 1}, + dnnl::memory::data_type::f32, + dnnl::memory::format_tag::abcde); + break; + default: + TORCH_INTERNAL_ASSERT(0, + "XPU only supports append_bias for Conv1d, Conv2d and Conv3d."); + } + // In this case, expected_md = binary_md + ops_params_.push_back(PostOpParam( + binary_, binary_md, binary_md, kind_with_binary_add, kind_t::binary)); + return *this; + } + + // append prelu post op + Attr& append_post_prelu(int mask) { + ops_params_.push_back(PostOpParam(mask, kind_t::prelu)); + return *this; + } + + dnnl::post_ops extract_post_ops(const at::Tensor& dst){ + // this function is used to extract post ops params from the ops_params_ + // and put them into onednn post ops + for (size_t i = 0; i < ops_params_.size(); ++i) { + kind_t kind = ops_params_[i].kind_; + switch (kind) { + case kind_t::eltwise: { + dnnl::algorithm algo = ops_params_[i].algo_; + float alpha = ops_params_[i].alpha_; + float beta = ops_params_[i].beta_; + dnnl_post_ops_.append_eltwise(algo, alpha, beta); + break; + } + case kind_t::sum: { + float scale = ops_params_[i].scale_; + // TODO [Asymmetric]: + // Post-sum zp for gpu is not supported currently + dnnl_post_ops_.append_sum(scale); + break; + } + case kind_t::binary: { + dnnl::algorithm algo = ops_params_[i].algo_; + auto expected_md = ops_params_[i].expected_meta_; + // In this case user may create src1 memory descriptor with + // format_tag::any or set a specific tag. However, in later case if + // tags mismatch with dst, it would result in suboptimal performance. + // So here we use format_tag::any to make sure the fast can be + // selected. + // Thus we use expected_md (with format_any) here to create pd instead + // of original md + dnnl_post_ops_.append_binary(algo, expected_md); + break; + } + default: + break; + } + } + + // if output is quantized, then append the eltwise linear to adjust the + // output scale/zero_point + if (dst.is_quantized()) { + // [Note: Gap of u8 qtensor scale between oneDNN and PyTorch] + // The /2 here is for output_scale collected by observer is different + // from quantization requirements in oneDNN. + // For Observer, the conv_scale (activation scale in other case) is + // computed through 2max_v/(qmax - qmin). 
The max_v is collected + // from the tensor to be observerd. + // (https://pytorch.org/docs/stable/generated/torch.quantization.observer.MinMaxObserver.html#torch.quantization.observer.MinMaxObserver) + // On the other hand, for u8 in oneDNN, the scale for quantization is + // defined as max_v/(qmax-qmin). Hence, we need to divide by 2 here. + // (https://oneapi-src.github.io/oneDNN/dev_guide_inference_int8.html) + dnnl_post_ops_.append_eltwise( + kind_with_linear, 1.f / q_scale_, q_zero_point_); + } + return dnnl_post_ops_; + } + + bool with_sum() { + for (size_t i = 0; i < ops_params_.size(); ++i) { + if (ops_params_[i].kind_ == kind_t::sum) { + return true; + } + } + return false; + } + + bool with_binary() { + for (size_t i = 0; i < ops_params_.size(); ++i) { + if (ops_params_[i].kind_ == kind_t::binary) { + return true; + } + } + return false; + } + + void construct_post_binary( + dnnl::primitive_desc& pd, + std::unordered_map& args) { + // This function is used to construct binary memory desc in binary post ops. + // According to oneDNN doc, the binary tensor can be in shape of + // [1, 1, 1, 1], tensor broadcast + // [1, C, 1, 1], channel broadcast + // [dst.shape], no broadcast and eltwise-wise binary operations on dst + + auto engine = + GpuEngineManager::Instance().get_engine({c10::kXPU, c10::xpu::current_device()}); + for (size_t i = 0; i < ops_params_.size(); ++i) { + kind_t kind = ops_params_[i].kind_; + if (kind == kind_t::binary) { + dnnl::memory binary_m; + auto binary = ops_params_[i].binary_; + auto md = ops_params_[i].meta_; + // qeury expected_md to achieve peak performance + auto expected_md = pd.query_md( + dnnl::query::exec_arg_md, + DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1); + + binary_m = at::native::onednn::make_onednn_memory( + md, engine, binary.data_ptr() + ); + + args.insert( + {DNNL_ARG_ATTR_MULTIPLE_POST_OP(i) | DNNL_ARG_SRC_1, binary_m}); + } + } + } + + float q_scale_ = 1.0; // the scale used to quantize the fused result from fp32 + // to int8, only works for int8 case + int64_t q_zero_point_ = 0; + std::vector ops_params_; // series of post ops + dnnl::post_ops dnnl_post_ops_; +}; + +} // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Conv.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/Conv.cpp new file mode 100644 index 0000000000000..87ddd0af34fe9 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Conv.cpp @@ -0,0 +1,451 @@ +#include + +#include +#include +#include +#include + +#include +#include + +#include + +namespace at::native::onednn { + +constexpr int src_batch_size_dim = 0; +constexpr int weight_dst_channels_dim = 0; + +dnnl::memory::dims conv_dst_size( + int64_t ndim, + IntArrayRef src_size, + IntArrayRef weight_size, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation) { + bool has_dilation = dilation.size() > 0; + dnnl::memory::dims dst_size(ndim); + dst_size[0] = src_size[src_batch_size_dim]; + dst_size[1] = weight_size[weight_dst_channels_dim]; + for (int d = 2; d < ndim; ++d) { + auto dilate = has_dilation ? 
dilation[d - 2] : 1; + auto kernel = dilate * (weight_size[d] - 1) + 1; + dst_size[d] = + (src_size[d] + + (padding_front_top_left[d - 2] + padding_back_bottom_right[d - 2]) - + kernel) / + stride[d - 2] + + 1; + } + return dst_size; +} + +static inline dnnl::memory::dims compatible_dilation(IntArrayRef& dilation) { + dnnl::memory::dims ret = dilation.vec(); + for (auto it = ret.begin(); it != ret.end(); it++) { + *it -= 1; + } + return ret; +} + +static inline dnnl::memory::format_tag conv_src_fmt( + const int64_t ndim, + const bool is_channels_last = false) { + if (!is_channels_last) { + return (ndim == 3) + ? dnnl::memory::format_tag::ncw + : ((ndim == 4) ? dnnl::memory::format_tag::nchw + : ((ndim == 5) ? dnnl::memory::format_tag::ncdhw + : dnnl::memory::format_tag::undef)); + } else { + return (ndim == 3) + ? dnnl::memory::format_tag::nwc + : ((ndim == 4) ? dnnl::memory::format_tag::nhwc + : ((ndim == 5) ? dnnl::memory::format_tag::ndhwc + : dnnl::memory::format_tag::undef)); + } +} + +static inline dnnl::memory::format_tag conv_weight_fmt( + const int64_t ndim, + const bool grouped = false, + const bool is_channels_last = false) { + if (!is_channels_last) { + return (ndim == 3) + ? (grouped ? dnnl::memory::format_tag::goiw : dnnl::memory::format_tag::oiw) + : (ndim == 4) + ? (grouped ? dnnl::memory::format_tag::goihw : dnnl::memory::format_tag::oihw) + : ((ndim == 5) ? (grouped ? dnnl::memory::format_tag::goidhw + : dnnl::memory::format_tag::oidhw) + : dnnl::memory::format_tag::undef); + } else { + return (ndim == 3) + ? (grouped ? dnnl::memory::format_tag::gowi : dnnl::memory::format_tag::owi) + : (ndim == 4) + ? (grouped ? dnnl::memory::format_tag::gohwi : dnnl::memory::format_tag::ohwi) + : ((ndim == 5) ? (grouped ? dnnl::memory::format_tag::godhwi + : dnnl::memory::format_tag::odhwi) + : dnnl::memory::format_tag::undef); + } +} + +static inline dnnl::memory::dims compatible_weight_dims( + const int64_t ndim, + const int64_t groups, + const int64_t oc, + const int64_t ic, + const IntArrayRef wsizes) { + if (ndim == 3) { + auto kw = wsizes[2]; + return (groups != 1) ? dnnl::memory::dims({groups, oc / groups, ic / groups, kw}) + : dnnl::memory::dims({oc, ic, kw}); + } else if (ndim == 4) { + auto kh = wsizes[2]; + auto kw = wsizes[3]; + return (groups != 1) + ? dnnl::memory::dims({groups, oc / groups, ic / groups, kh, kw}) + : dnnl::memory::dims({oc, ic, kh, kw}); + } else if (ndim == 5) { + auto kd = wsizes[2]; + auto kh = wsizes[3]; + auto kw = wsizes[4]; + return (groups != 1) + ? 
dnnl::memory::dims({groups, oc / groups, ic / groups, kd, kh, kw}) + : dnnl::memory::dims({oc, ic, kd, kh, kw}); + } + + return {}; +} + +static std::tuple< + dnnl::memory::desc, + dnnl::memory::desc, + dnnl::memory::desc> + conv_get_md( + const at::Tensor& src, + const at::Tensor& weight, + const at::Tensor& dst, + int64_t groups, + bool is_channels_last) { + // create memory desc from the src/weight/dst tensors + dnnl::memory::desc src_usr_md, weight_usr_md, dst_usr_md; + auto ndim = src.ndimension(); + auto fmt_src = + conv_src_fmt(ndim, is_channels_last); + + auto src_size = src.sizes().vec(); + auto src_data_t = get_onednn_dtype_include_double(src); + src_usr_md = dnnl::memory::desc(src_size, src_data_t, fmt_src); + + auto dst_size = dst.sizes().vec(); + auto dst_data_t = get_onednn_dtype_include_double(dst); + dst_usr_md = dnnl::memory::desc(dst_size, dst_data_t, fmt_src); + + auto ic = src.size(1); + auto oc = dst.size(1); + auto wei_data_t = get_onednn_dtype_include_double(weight); + dnnl::memory::dims weight_size = + compatible_weight_dims(ndim, groups, oc, ic, weight.sizes()); + auto fmt_weight = conv_weight_fmt( + ndim, + groups != 1, + is_channels_last); + weight_usr_md = dnnl::memory::desc(weight_size, wei_data_t, fmt_weight); + + return {src_usr_md, weight_usr_md, dst_usr_md}; +} + +sycl::event convolution( + at::Tensor& dst, + const at::Tensor& src, + const at::Tensor& weight, + const at::Tensor& bia, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + Attr& attr, + const std::vector& deps) { + auto engine = + GpuEngineManager::Instance().get_engine({c10::kXPU, c10::xpu::current_device()}); + auto stream = GpuStreamManager::Instance().get_stream(); + + bool is_channels_last = use_channels_last_for_conv(src, weight, false); + + // create usr_md for tensors, and md for conv primitive + dnnl::memory::desc src_md, weight_md, dst_md; + std::tie(src_md, weight_md, dst_md) = conv_get_md(src, weight, dst, groups, is_channels_last); + + auto bia_fmt = dnnl::memory::format_tag::x; + auto bia_md = bia.defined() + ? 
dnnl::memory::desc( + {dst.size(1)}, get_onednn_dtype_include_double(bia), bia_fmt) + : dnnl::memory::desc(); + + // create conv primitive descriptor + dnnl::memory::dims _stride = stride.vec(); + dnnl::memory::dims _padding_front_top_left = padding_front_top_left.vec(); + dnnl::memory::dims _padding_back_bottom_right = padding_back_bottom_right.vec(); + dnnl::memory::dims _dilation = compatible_dilation(dilation); + + // extract post ops + dnnl::primitive_attr pattr; + dnnl::post_ops po = attr.extract_post_ops(dst); + pattr.set_post_ops(po); + + pattr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + + #if ONEDNN_SUPPORT_DETERMINISTIC + if(at::globalContext().deterministicAlgorithms()) + pattr.set_deterministic(true); + #endif + + auto conv_fwd_pd = dnnl::convolution_forward::primitive_desc( + engine, + dnnl::prop_kind::forward, + dnnl::algorithm::convolution_direct, + src_md, + weight_md, + bia_md, + dst_md, + _stride, + _dilation, + _padding_front_top_left, + _padding_back_bottom_right, + pattr); + + dnnl::memory src_m, weight_m, dst_m, bia_m; + at::Tensor src_blocked, weight_blocked, dst_blocked = dst; + + src_m = make_onednn_memory(src_md, engine, src.data_ptr()); + weight_m = make_onednn_memory(weight_md, engine, weight.data_ptr()); + dst_m = make_onednn_memory(dst_md, engine, dst.data_ptr()); + + + std::unordered_map args; + if (bia.defined()) { + bia_m = make_onednn_memory(bia_md, engine, bia.data_ptr()); + args.insert({DNNL_ARG_BIAS, bia_m}); + } + auto expected_dst_md = conv_fwd_pd.dst_desc(); + if (attr.with_binary()) + attr.construct_post_binary(conv_fwd_pd, args); + + args.insert({DNNL_ARG_SRC, src_m}); + args.insert({DNNL_ARG_WEIGHTS, weight_m}); + args.insert({DNNL_ARG_DST, dst_m}); + + size_t scratchpad_size = conv_fwd_pd.scratchpad_desc().get_size(); + at::Tensor scratchpad_tensor = at::empty( + {static_cast(scratchpad_size)}, src.options().dtype(at::kByte), c10::nullopt); + auto scratchpad_m = make_onednn_memory( + conv_fwd_pd.scratchpad_desc(), engine, scratchpad_tensor.data_ptr()); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad_m}); + + auto conv_forward = dnnl::convolution_forward(conv_fwd_pd); + auto conv_fwd_event = dnnl::sycl_interop::execute(conv_forward, stream, args, deps); + + return conv_fwd_event; +} + +sycl::event convolution_backward_weights( + at::Tensor& diff_weight, + at::Tensor& diff_bia, + const at::Tensor& diff_dst, + const at::Tensor& src, + IntArrayRef diff_weight_aten_size, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + const std::vector& deps) { + auto engine = + GpuEngineManager::Instance().get_engine({c10::kXPU, c10::xpu::current_device()}); + auto stream = GpuStreamManager::Instance().get_stream(); + + bool is_channels_last = use_channels_last_for_conv(src, diff_dst, /*is_transposed=*/false); + + // create dnnl::memory desc + dnnl::memory::desc src_md, weight_md, dst_md; + std::tie(src_md, weight_md, dst_md) = + conv_get_md(src, diff_weight, diff_dst, groups, is_channels_last); + dnnl::memory::format_tag bia_fmt = dnnl::memory::format_tag::x; + auto bia_md = diff_bia.defined() + ? 
dnnl::memory::desc({diff_dst.size(1)}, src_md.get_data_type(), bia_fmt) + : dnnl::memory::desc(); + + // create fwd primitive hint + dnnl::memory::dims _stride = stride.vec(); + dnnl::memory::dims _padding_front_top_left = padding_front_top_left.vec(); + dnnl::memory::dims _padding_back_bottom_right = padding_back_bottom_right.vec(); + dnnl::memory::dims _dilation = compatible_dilation(dilation); + dnnl::primitive_attr pattr; + + #if ONEDNN_SUPPORT_DETERMINISTIC + if(at::globalContext().deterministicAlgorithms()) + pattr.set_deterministic(true); + #endif + + pattr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + auto conv_fwd_pd = dnnl::convolution_forward::primitive_desc( + engine, + dnnl::prop_kind::forward, + dnnl::algorithm::convolution_direct, + src_md, + weight_md, + bia_md, + dst_md, + _stride, + _dilation, + _padding_front_top_left, + _padding_back_bottom_right, + pattr); + + // create bwd weight primitive + auto conv_bwd_w_pd = dnnl::convolution_backward_weights::primitive_desc( + engine, + dnnl::algorithm::convolution_direct, + src_md, + weight_md, + bia_md, + dst_md, + _stride, + _dilation, + _padding_front_top_left, + _padding_back_bottom_right, + conv_fwd_pd, + pattr); + + // create bwd memory + at::Tensor expected_src, expected_diff_dst, expected_diff_weight; + dnnl::memory src_m, diff_dst_m, diff_weight_m; + + src_m = make_onednn_memory(src_md, engine, src.data_ptr()); + diff_dst_m = make_onednn_memory(dst_md, engine, diff_dst.data_ptr()); + diff_weight_m = make_onednn_memory(weight_md, engine, diff_weight.data_ptr()); + + // insert args + std::unordered_map args; + args.insert({DNNL_ARG_DIFF_DST, diff_dst_m}); + args.insert({DNNL_ARG_SRC, src_m}); + args.insert({DNNL_ARG_DIFF_WEIGHTS, diff_weight_m}); + if (diff_bia.defined()) { + dnnl::memory diff_bia_m = + make_onednn_memory(bia_md, engine, diff_bia.data_ptr()); + args.insert({DNNL_ARG_DIFF_BIAS, diff_bia_m}); + } + + size_t scratchpad_size = conv_bwd_w_pd.scratchpad_desc().get_size(); + at::Tensor scratchpad_tensor = at::empty( + {static_cast(scratchpad_size)}, src.options().dtype(at::kByte), c10::nullopt); + auto scratchpad_m = make_onednn_memory( + conv_bwd_w_pd.scratchpad_desc(), engine, scratchpad_tensor.data_ptr()); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad_m}); + + // execute primitive + auto conv_bwd_w = dnnl::convolution_backward_weights(conv_bwd_w_pd); + sycl::event conv_bwd_w_event = dnnl::sycl_interop::execute(conv_bwd_w, stream, args, deps); + + return conv_bwd_w_event; +} + +sycl::event convolution_backward_data( + at::Tensor& diff_src, + const at::Tensor& diff_dst, + const at::Tensor& weight, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool bias_defined, + const std::vector& deps) { + auto engine = + GpuEngineManager::Instance().get_engine({c10::kXPU, c10::xpu::current_device()}); + auto stream = GpuStreamManager::Instance().get_stream(); + + bool is_channels_last = use_channels_last_for_conv(diff_dst, weight, /*is_transposed=*/false); + + // create memory desc + dnnl::memory::desc src_md, weight_md, dst_md; + std::tie(src_md, weight_md, dst_md) = + conv_get_md(diff_src, weight, diff_dst, groups, is_channels_last); + dnnl::memory::format_tag bia_fmt = dnnl::memory::format_tag::x; + auto bia_md = bias_defined + ? 
dnnl::memory::desc({diff_dst.size(1)}, weight_md.get_data_type(), bia_fmt) + : dnnl::memory::desc(); + + // create fwd primitive desc hint + dnnl::primitive_attr pattr; + + #if ONEDNN_SUPPORT_DETERMINISTIC + if(at::globalContext().deterministicAlgorithms()) + pattr.set_deterministic(true); + #endif + + pattr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + dnnl::memory::dims _stride = stride.vec(); + dnnl::memory::dims _padding_front_top_left = padding_front_top_left.vec(); + dnnl::memory::dims _padding_back_bottom_right = padding_back_bottom_right.vec(); + dnnl::memory::dims _dilation = compatible_dilation(dilation); + auto conv_forward_pd = dnnl::convolution_forward::primitive_desc( + engine, + dnnl::prop_kind::forward, + dnnl::algorithm::convolution_direct, + src_md, + weight_md, + bia_md, + dst_md, + _stride, + _dilation, + _padding_front_top_left, + _padding_back_bottom_right, + pattr); + + auto conv_backward_data_pd = dnnl::convolution_backward_data::primitive_desc( + engine, + dnnl::algorithm::convolution_direct, + src_md, + weight_md, + dst_md, + _stride, + _dilation, + _padding_front_top_left, + _padding_back_bottom_right, + conv_forward_pd, + pattr); + + // create memory + at::Tensor expected_src, expected_wei, expected_dst; + dnnl::memory diff_dst_m, wei_m, diff_src_m; + + diff_src_m = make_onednn_memory(src_md, engine, diff_src.data_ptr()); + wei_m = make_onednn_memory(weight_md, engine, weight.data_ptr()); + diff_dst_m = make_onednn_memory(dst_md, engine, diff_dst.data_ptr()); + + + // insert args + std::unordered_map args; + size_t scratchpad_size = conv_backward_data_pd.scratchpad_desc().get_size(); + at::Tensor scratchpad_tensor = at::empty( + {static_cast(scratchpad_size)}, diff_dst.options().dtype(at::kByte), c10::nullopt); + auto scratchpad_memory = make_onednn_memory( + conv_backward_data_pd.scratchpad_desc(), + engine, + scratchpad_tensor.data_ptr()); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad_memory}); + args.insert({DNNL_ARG_DIFF_DST, diff_dst_m}); + args.insert({DNNL_ARG_WEIGHTS, wei_m}); + args.insert({DNNL_ARG_DIFF_SRC, diff_src_m}); + + // execute primitive + auto conv_backward_data = + dnnl::convolution_backward_data(conv_backward_data_pd); + auto conv_backward_data_event = dnnl::sycl_interop::execute(conv_backward_data, stream, args, deps); + return conv_backward_data_event; + +} + +} // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Deconv.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/Deconv.cpp new file mode 100644 index 0000000000000..b8465c62c7e20 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Deconv.cpp @@ -0,0 +1,435 @@ +#include +#include + +#include +#include +#include +#include + +namespace at::native::onednn { + +static inline dnnl::memory::dims deconv_compatible_dilation(IntArrayRef& dilation) { + dnnl::memory::dims ret = dilation.vec(); + for (auto it = ret.begin(); it != ret.end(); it++) { + *it -= 1; + } + return ret; +} + +static inline std::vector compatible_groups_deconv_strides( + const at::Tensor& weight, + dnnl::memory::dims group_size) { + std::vector strides = weight.strides().vec(); + strides[0] = weight.strides()[1]; + strides[1] = weight.strides()[0]; + strides.insert(strides.begin(), group_size[2] * weight.strides()[0]); + return strides; +} + +dnnl::memory::dims deconv_dst_size( + IntArrayRef src_size, + IntArrayRef weight_size, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + IntArrayRef dst_padding, + int64_t groups) { + auto dim = src_size.size(); + 
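  // The loop below computes the transposed-convolution output size with the
  // same formula PyTorch documents for ConvTranspose{1,2,3}d:
  //   dst[d] = (src[d] - 1) * stride - 2 * padding
  //            + dilation * (kernel - 1) + 1 + output_padding
  // e.g. src=5, stride=2, padding=1, kernel=3, dilation=1, output_padding=0
  //      gives (5 - 1) * 2 - 2 * 1 + (1 * (3 - 1) + 1) + 0 = 9.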
dnnl::memory::dims dst_size(dim); + auto kernel_size = weight_size.slice(2); + + dst_size[0] = src_size[0]; + dst_size[1] = weight_size[1] * groups; + for (size_t d = 2; d < dim; ++d) { + dst_size[d] = (src_size[d] - 1) * stride[d - 2] - 2 * padding[d - 2] + + (dilation[d - 2] * (kernel_size[d - 2] - 1) + 1) + dst_padding[d - 2]; + } + return dst_size; +} + +static inline dnnl::memory::format_tag deconv_src_fmt( + const int64_t ndim, + const bool is_channels_last = false) { + // 3D: n/c/w (n/w/c) [a/b/c (a/c/b)] + // 4D: n/c/h/w (n/h/w/c) [a/b/c/d (a/c/d/b)] + // 5D: n/c/d/h/w (n/d/h/w/c) [a/b/c/d/e (a/c/d/e/b)] + if (!is_channels_last) { + return (ndim == 3) + ? dnnl::memory::format_tag::ncw + : ((ndim == 4) ? dnnl::memory::format_tag::nchw + : ((ndim == 5) ? dnnl::memory::format_tag::ncdhw + : dnnl::memory::format_tag::undef)); + } else { + return (ndim == 3) + ? dnnl::memory::format_tag::nwc + : ((ndim == 4) ? dnnl::memory::format_tag::nhwc + : ((ndim == 5) ? dnnl::memory::format_tag::ndhwc + : dnnl::memory::format_tag::undef)); + } +} + +static inline std::vector deconv_weight_fmt( + const at::Tensor& weight, + const int64_t ndim, + dnnl::memory::dims weight_size, + const bool grouped = false, + const bool is_channels_last = false) { + // 3D fmt: (g)i/o/w ((g)i/w/o) [b/a/c (b/c/a)] + // 4D fmt: (g)i/o/h/w ((g)i/h/w/o) [b/a/c/d (b/c/d/a)] + // 5D fmt: (g)i/o/d/h/w ((g)i/d/h/w/o) [b/a/c/d/e (b/c/d/e/a)] + auto strides_ = weight.strides().vec(); + std::vector strides; + if (grouped) { + strides = compatible_groups_deconv_strides(weight, weight_size); + } else { + strides = strides_; + std::swap(strides[0], strides[1]); + } + return strides; +} + +static inline dnnl::memory::dims deconv_compatible_weight_dims( + int64_t ndim, + int64_t groups, + int64_t oc, + int64_t ic, + IntArrayRef weight_size) { + if (ndim == 3) { + auto kw = weight_size[2]; + return (groups != 1) ? dnnl::memory::dims({groups, oc / groups, ic / groups, kw}) + : dnnl::memory::dims({oc, ic, kw}); + } else if (ndim == 4) { + auto kh = weight_size[2]; + auto kw = weight_size[3]; + return (groups != 1) + ? dnnl::memory::dims({groups, oc / groups, ic / groups, kh, kw}) + : dnnl::memory::dims({oc, ic, kh, kw}); + } else if (ndim == 5) { + auto kd = weight_size[2]; + auto kh = weight_size[3]; + auto kw = weight_size[4]; + return (groups != 1) + ? 
dnnl::memory::dims({groups, oc / groups, ic / groups, kd, kh, kw}) + : dnnl::memory::dims({oc, ic, kd, kh, kw}); + } else { + TORCH_CHECK(0, "unsupported dimension in xpu oneDNN deconvolution..."); + } +} + +static std::tuple< + dnnl::memory::desc, + dnnl::memory::desc, + dnnl::memory::desc> +deconv_get_plain_md( + const at::Tensor& src, + const at::Tensor& weight, + const at::Tensor& dst, + int64_t groups, + bool is_channels_last_suggested) { + auto ndim = src.ndimension(); + auto src_data_t = get_onednn_dtype_include_double(src); + auto fmt_src = deconv_src_fmt(ndim, is_channels_last_suggested); + auto src_usr_md = dnnl::memory::desc(src.sizes().vec(), src_data_t, fmt_src); + + auto dst_data_t = get_onednn_dtype_include_double(dst); + auto dst_usr_md = dnnl::memory::desc(dst.sizes().vec(), dst_data_t, fmt_src); + + auto ic = src.size(1); + auto oc = dst.size(1); + dnnl::memory::dims weight_size = + deconv_compatible_weight_dims(ndim, groups, oc, ic, weight.sizes()); + auto weight_dt = get_onednn_dtype_include_double(weight); + auto fmt_weight = deconv_weight_fmt( + weight, ndim, weight_size, groups != 1, is_channels_last_suggested); + dnnl::memory::desc weight_usr_md = dnnl::memory::desc(weight_size, weight_dt, fmt_weight); + + return {src_usr_md, weight_usr_md, dst_usr_md}; +} + +sycl::event deconvolution( + at::Tensor& dst, + const at::Tensor& src, + const at::Tensor& weight, + const at::Tensor& bia, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dst_padding, + IntArrayRef dilation, + int64_t groups, + Attr& attr, + const std::vector& deps) { + auto engine = + GpuEngineManager::Instance().get_engine({c10::kXPU, c10::xpu::current_device()}); + auto stream = GpuStreamManager::Instance().get_stream(); + + bool is_channels_last_suggested = use_channels_last_for_conv(src, weight, /*is_transposed=*/true); + + // create usr_md for tensors, and md for conv primitive + dnnl::memory::desc src_md, weight_md, dst_md; + + std::tie(src_md, weight_md, dst_md) = + deconv_get_plain_md(src, weight, dst, groups, is_channels_last_suggested); + + dnnl::memory::format_tag bia_fmt = dnnl::memory::format_tag::x; + auto bia_md = bia.defined() + ? 
dnnl::memory::desc( + {dst.size(1)}, get_onednn_dtype_include_double(bia), bia_fmt) + : dnnl::memory::desc(); + + // create primitive desc + dnnl::memory::dims _stride = stride.vec(); + dnnl::memory::dims _padding = padding.vec(); + dnnl::memory::dims _dilation = deconv_compatible_dilation(dilation); + + // construct primitive attr + dnnl::primitive_attr pattr; + dnnl::post_ops po = attr.extract_post_ops(dst); + pattr.set_post_ops(po); + #if ONEDNN_SUPPORT_DETERMINISTIC + if(at::globalContext().deterministicAlgorithms()) + pattr.set_deterministic(true); + #endif + + pattr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + + auto deconv_fwd_pd = dnnl::deconvolution_forward::primitive_desc( + engine, + dnnl::prop_kind::forward, + dnnl::algorithm::deconvolution_direct, + src_md, + weight_md, + bia_md, + dst_md, + _stride, + _dilation, + _padding, + _padding, + pattr); + + dnnl::memory src_m, weight_m, dst_m, bia_m; + at::Tensor src_blocked, weight_blocked, dst_blocked = dst; + + src_m = make_onednn_memory(src_md, engine, src.data_ptr()); + weight_m = make_onednn_memory(weight_md, engine, weight.data_ptr()); + dst_m = make_onednn_memory(dst_md, engine, dst.data_ptr()); + + std::unordered_map args; + args.insert({DNNL_ARG_SRC, src_m}); + args.insert({DNNL_ARG_WEIGHTS, weight_m}); + args.insert({DNNL_ARG_DST, dst_m}); + + if (bia.defined()) { + auto bia_m = make_onednn_memory(bia_md, engine, bia.data_ptr()); + args.insert({DNNL_ARG_BIAS, bia_m}); + } + if (attr.with_binary()) + attr.construct_post_binary(deconv_fwd_pd, args); + + size_t scratchpad_size = deconv_fwd_pd.scratchpad_desc().get_size(); + at::Tensor scratchpad_tensor = at::empty( + {static_cast(scratchpad_size)}, src.options().dtype(at::kByte), c10::nullopt); + auto scratchpad_m = make_onednn_memory( + deconv_fwd_pd.scratchpad_desc(), engine, scratchpad_tensor.data_ptr()); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad_m}); + + auto deconv_fwd = dnnl::deconvolution_forward(deconv_fwd_pd); + sycl::event deconv_event = dnnl::sycl_interop::execute(deconv_fwd, stream, args, deps); + return deconv_event; + +} + +sycl::event deconvolution_backward_data( + at::Tensor& diff_src, + const at::Tensor& diff_dst, + const at::Tensor& weight, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + bool bias_defined, + const std::vector& deps) { + auto engine = + GpuEngineManager::Instance().get_engine({c10::kXPU, c10::xpu::current_device()}); + auto stream = GpuStreamManager::Instance().get_stream(); + + bool is_channels_last_suggested = + use_channels_last_for_conv(diff_dst, weight, /*is_transposed=*/true); + // create memory desc + dnnl::memory::desc src_md, weight_md, dst_md; + std::tie(src_md, weight_md, dst_md) = + deconv_get_plain_md( + diff_src, weight, diff_dst, groups, is_channels_last_suggested); + + dnnl::memory::format_tag bia_fmt = dnnl::memory::format_tag::x; + auto bias_md = bias_defined + ? 
dnnl::memory::desc({diff_dst.size(1)}, weight_md.get_data_type(), bia_fmt) + : dnnl::memory::desc(); + + // create fwd primitive desc hint + dnnl::primitive_attr pattr; + pattr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + #if ONEDNN_SUPPORT_DETERMINISTIC + if(at::globalContext().deterministicAlgorithms()) + pattr.set_deterministic(true); + #endif + + dnnl::memory::dims _stride = stride.vec(); + dnnl::memory::dims _padding = padding.vec(); + dnnl::memory::dims _dilation = deconv_compatible_dilation(dilation); + auto deconv_fwd_pd = dnnl::deconvolution_forward::primitive_desc( + engine, + dnnl::prop_kind::forward, + dnnl::algorithm::deconvolution_direct, + src_md, + weight_md, + bias_md, + dst_md, + _stride, + _dilation, + _padding, + _padding, + pattr); + + // create bwd primitive desc + auto deconv_backward_data_pd = dnnl::deconvolution_backward_data::primitive_desc( + engine, + dnnl::algorithm::deconvolution_direct, + src_md, + weight_md, + dst_md, + _stride, + _dilation, + _padding, + _padding, + deconv_fwd_pd); + + // create memory + dnnl::memory diff_dst_m, wei_m, diff_src_m; + + diff_src_m = make_onednn_memory(src_md, engine, diff_src.data_ptr()); + wei_m = make_onednn_memory(weight_md, engine, weight.data_ptr()); + diff_dst_m = make_onednn_memory(dst_md, engine, diff_dst.data_ptr()); + + // insert args + std::unordered_map args; + size_t scratchpad_size = deconv_backward_data_pd.scratchpad_desc().get_size(); + at::Tensor scratchpad_tensor = at::empty( + {static_cast(scratchpad_size)}, diff_dst.options().dtype(at::kByte), c10::nullopt); + auto scratchpad_memory = make_onednn_memory( + deconv_backward_data_pd.scratchpad_desc(), + engine, + scratchpad_tensor.data_ptr()); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad_memory}); + args.insert({DNNL_ARG_DIFF_DST, diff_dst_m}); + args.insert({DNNL_ARG_WEIGHTS, wei_m}); + args.insert({DNNL_ARG_DIFF_SRC, diff_src_m}); + + // execute primitive + auto deconv_backward_data = + dnnl::deconvolution_backward_data(deconv_backward_data_pd); + sycl::event deconv_bwd_data_event = dnnl::sycl_interop::execute(deconv_backward_data, stream, args, deps); + return deconv_bwd_data_event; + +} + +sycl::event deconvolution_backward_weights( + at::Tensor& diff_weight, + at::Tensor& diff_bia, + const at::Tensor& diff_dst, + const at::Tensor& src, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + const std::vector& deps) { + auto engine = + GpuEngineManager::Instance().get_engine({c10::kXPU, c10::xpu::current_device()}); + auto stream = GpuStreamManager::Instance().get_stream(); + + bool is_channels_last_suggested = + use_channels_last_for_conv(src, diff_dst, /*is_transposed=*/true); + + // create memory desc + dnnl::memory::desc src_md, weight_md, dst_md; + std::tie(src_md, weight_md, dst_md) = deconv_get_plain_md( + src, diff_weight, diff_dst, groups, is_channels_last_suggested); + + dnnl::memory::format_tag bia_fmt = dnnl::memory::format_tag::x; + auto bia_md = diff_bia.defined() + ? 
dnnl::memory::desc({diff_dst.size(1)}, src_md.get_data_type(), bia_fmt) + : dnnl::memory::desc(); + + // create fwd primitive desc hint + dnnl::memory::dims _stride = stride.vec(); + dnnl::memory::dims _padding = padding.vec(); + dnnl::memory::dims _dilation = deconv_compatible_dilation(dilation); + dnnl::primitive_attr pattr; + + #if ONEDNN_SUPPORT_DETERMINISTIC + if(at::globalContext().deterministicAlgorithms()) + pattr.set_deterministic(true); + #endif + pattr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + auto deconv_fwd_pd = dnnl::deconvolution_forward::primitive_desc( + engine, + dnnl::prop_kind::forward, + dnnl::algorithm::deconvolution_direct, + src_md, + weight_md, + bia_md, + dst_md, + _stride, + _dilation, + _padding, + _padding, + pattr); + + auto deconv_bwd_w_pd = dnnl::deconvolution_backward_weights::primitive_desc( + engine, + dnnl::algorithm::deconvolution_direct, + src_md, + weight_md, + bia_md, + dst_md, + _stride, + _dilation, + _padding, + _padding, + deconv_fwd_pd, + pattr); + + // create bwd dnnl::memory + dnnl::memory src_m, diff_dst_m, diff_weight_m; + + src_m = make_onednn_memory(src_md, engine, src.data_ptr()); + diff_dst_m = make_onednn_memory(dst_md, engine, diff_dst.data_ptr()); + diff_weight_m = make_onednn_memory(weight_md, engine, diff_weight.data_ptr()); + + // insert args + std::unordered_map args; + args.insert({DNNL_ARG_DIFF_DST, diff_dst_m}); + args.insert({DNNL_ARG_SRC, src_m}); + args.insert({DNNL_ARG_DIFF_WEIGHTS, diff_weight_m}); + + if (diff_bia.defined()) { + dnnl::memory diff_bia_m = + make_onednn_memory(bia_md, engine, diff_bia.data_ptr()); + args.insert({DNNL_ARG_DIFF_BIAS, diff_bia_m}); + } + + size_t scratchpad_size = deconv_bwd_w_pd.scratchpad_desc().get_size(); + at::Tensor scratchpad_tensor = at::empty( + {static_cast(scratchpad_size)}, src.options().dtype(at::kByte), c10::nullopt); + auto scratchpad_m = make_onednn_memory( + deconv_bwd_w_pd.scratchpad_desc(), engine, scratchpad_tensor.data_ptr()); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad_m}); + + // execute primitive + auto deconv_bwd_w = dnnl::deconvolution_backward_weights(deconv_bwd_w_pd); + + sycl::event deconv_bwd_w_event = dnnl::sycl_interop::execute(deconv_bwd_w, stream, args, deps); + return deconv_bwd_w_event; + +} + +} // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Matmul.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/Matmul.cpp new file mode 100644 index 0000000000000..7dfd31b93ba8d --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Matmul.cpp @@ -0,0 +1,244 @@ + +#include + +#include +#include + +#include +#include + +#include + +namespace at::native::onednn { + +sycl::event matmul( + at::Tensor& result, + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Tensor& b_raw, + bool m2_trans, + Attr attr, + const std::vector& deps) { + int64_t dims = result.dim(); + TORCH_CHECK( + dims == 2 || dims == 3, + "oneDNN matmul only works with 2D or 3D, got ", + dims); + TORCH_CHECK( + dims == mat1.dim() && dims == mat2.dim(), + "oneDNN input matrixes must have the same ranks"); + TORCH_CHECK(result.defined(), "oneDNN matmul result should be defined"); + + at::Device cur_device = at::Device(at::kXPU, c10::xpu::current_device()); + auto engine = GpuEngineManager::Instance().get_engine(cur_device); + auto stream = GpuStreamManager::Instance().get_stream(); + + at::Tensor m1 = is_onednn_matmul_strides(mat1) ? mat1 : mat1.contiguous(); + at::Tensor m2 = is_onednn_matmul_strides(mat2) ? 
mat2 : mat2.contiguous(); + at::Tensor dst = is_onednn_matmul_strides(result, true) ? result : result.contiguous(); + + int64_t m = dst.size(-2); + int64_t n = dst.size(-1); + int64_t k = m1.size(-1); + int64_t mb = 1; + + if (dims == 3) { + mb = dst.size(0); + TORCH_CHECK( + mb == m1.size(0) && mb == m2.size(0), + "batch size mismatch, dst mb: ", + mb, + "m1 mb", + m1.size(0), + " m2 mb: ", + m2.size(0)); + } + + // validate bias and make it compatible with oneDNN implementation + bool with_bias = false; + at::Tensor b = b_raw; + if (b.defined()) { + with_bias = true; + if (b.dim() == 1) { + TORCH_CHECK( + b.size(0) == n || b.size(0) == 1, + "matmul supports [n] or [1] when bias dim is 1 ..."); + if (b.size(0) == 0) { + with_bias = false; + } else if (m1.dim() == 3) { + b = b.expand({mb, m, n}).contiguous(); + } else if (m1.dim() == 2) { + b = b.expand({1, n}).contiguous(); + } + } else if (b.dim() == 2) { + TORCH_CHECK( + (b.size(0) == m && b.size(1) == n) || + (b.size(0) == 1 && b.size(1) == n) || + (b.size(0) == m && b.size(1) == 1) || + (b.size(0) == 1 && b.size(1) == 1), + "matmul supports [m, n] or [1, n] or [m, 1] or [1, 1] when bias dim is 2 ..."); + if (b.size(0) == 1 && b.size(1) == 1) + b = b.expand({1, n}).contiguous(); + } else if (b.dim() == 3) { + TORCH_CHECK( + at::are_expandable({mb, m, n}, b.sizes()), + "matmul bias must be expandable to:", + dst.sizes(), + " but got:", + b.sizes()); + b = b.expand({mb, m, n}).contiguous(); + } else if (b.dim() == 0) { + TORCH_CHECK( + b.numel() == 1, "matmul supports 1 numel when bias dim is [] ..."); + if (m1.dim() == 3) { + b = b.expand({mb, m, n}).contiguous(); + } else { + b = b.expand({1, n}).contiguous(); + } + } else { + TORCH_CHECK(0, "unsupported bias dim in matmul ..."); + } + } + + b = b.contiguous(); // avoid reorder 2 times + + // xpu matmul support both ab/ba shape for m2 tensor, we don't check any more + auto m1_usr_dt = get_onednn_dtype(m1); + auto m2_usr_dt = get_onednn_dtype(m2); + auto dst_usr_dt = get_onednn_dtype(dst); + + auto m1_dt = m1_usr_dt; + auto m2_dt = m2_usr_dt; + auto dst_dt = dst_usr_dt; + dnnl::memory::data_type bias_dt; + + dnnl::memory::desc m1_md, m1_usr_md, m1_any_md; + dnnl::memory::desc m2_md, m2_usr_md, m2_any_md; + dnnl::memory::desc dst_md, dst_usr_md, dst_any_md; + dnnl::memory::desc bias_md; + + // Naive Master weight + if (m1_dt == dnnl::memory::data_type::bf16 && m2_dt == dnnl::memory::data_type::f32) { + m2_dt = dnnl::memory::data_type::bf16; + dst_dt = dnnl::memory::data_type::bf16; + } else if ( + m1_dt == dnnl::memory::data_type::f32 && m2_dt == dnnl::memory::data_type::bf16) { + m1_dt = dnnl::memory::data_type::bf16; + dst_dt = dnnl::memory::data_type::bf16; + } + + dnnl::memory::dims m1_dims, m2_dims, dst_dims, bias_dims; + dnnl::memory::dims m1_strides, m2_strides, dst_strides, bias_strides; + if (dims == 2) { + m1_dims = {m, k}; + m2_dims = {k, n}; + dst_dims = {m, n}; + + m1_strides = {m1.stride(0), m1.stride(1)}; + if (m2_trans) { + m2_strides = {m2.stride(0), m2.stride(1)}; + } else { + m2_strides = {m2.stride(1), m2.stride(0)}; + } + dst_strides = {dst.stride(0), dst.stride(1)}; + } else { + m1_dims = {mb, m, k}; + m2_dims = {mb, k, n}; + dst_dims = {mb, m, n}; + + m1_strides = {m1.stride(0), m1.stride(1), m1.stride(2)}; + if (m2_trans) { + m2_strides = {m2.stride(0), m2.stride(1), m2.stride(2)}; + } else { + m2_strides = {m2.stride(0), m2.stride(2), m2.stride(1)}; + } + dst_strides = {dst.stride(0), dst.stride(1), dst.stride(2)}; + } + + if (with_bias) { + bias_dims = 
get_onednn_dims(b); + bias_dt = get_onednn_dtype(b); + bias_strides = get_onednn_strides(b); + } + + dnnl::post_ops po = attr.extract_post_ops(dst); + + std::unordered_map args; + dnnl::matmul matmul_p; + dnnl::matmul::primitive_desc matmul_pd; + + // STEP1: create memory desc + m1_md = dnnl::memory::desc(m1_dims, m1_dt, m1_strides); + m2_md = dnnl::memory::desc(m2_dims, m2_dt, m2_strides); + dst_md = dnnl::memory::desc(dst_dims, dst_dt, dst_strides); + + // STEP2: creat attribute + dnnl::primitive_attr pattr; + pattr.set_post_ops(po); + + #if ONEDNN_SUPPORT_DETERMINISTIC + if(at::globalContext().deterministicAlgorithms()) + pattr.set_deterministic(true); + #endif + + // scratchpad + pattr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + + if (m1_dt == dnnl::memory::data_type::f32) { + pattr.set_fpmath_mode(dnnl::fpmath_mode::strict); + } + + // STEP3: create primitive + if (with_bias) { + bias_md = dnnl::memory::desc(bias_dims, bias_dt, bias_strides); + matmul_pd = + dnnl::matmul::primitive_desc(engine, m1_md, m2_md, bias_md, dst_md, pattr); + } else { + matmul_pd = dnnl::matmul::primitive_desc(engine, m1_md, m2_md, dst_md, pattr); + } + + matmul_p = dnnl::matmul(matmul_pd); + + m1_usr_md = dnnl::memory::desc(m1_dims, m1_usr_dt, m1_strides); + m2_usr_md = dnnl::memory::desc(m2_dims, m2_usr_dt, m2_strides); + dst_usr_md = dnnl::memory::desc(dst_dims, dst_usr_dt, dst_strides); + + // STEP4: create memory + auto m1_usr_m = make_onednn_memory(m1_usr_md, engine, m1.data_ptr()); + auto m2_usr_m = make_onednn_memory(m2_usr_md, engine, m2.data_ptr()); + auto dst_usr_m = make_onednn_memory(dst_usr_md, engine, dst.data_ptr()); + + auto expected_m1_md = matmul_pd.src_desc(); + auto expected_m2_md = matmul_pd.weights_desc(); + auto expected_dst_md = matmul_pd.dst_desc(); + + dnnl::memory m1_m = m1_usr_m, m2_m = m2_usr_m, dst_m = dst_usr_m; + at::Tensor m1_, m2_, dst_; + + if (attr.with_binary()) + attr.construct_post_binary(matmul_pd, args); + + size_t scratchpad_size = matmul_pd.scratchpad_desc().get_size(); + at::Tensor scratchpad_tensor = at::empty( + {static_cast(scratchpad_size)}, m1.options().dtype(at::kByte), c10::nullopt); + auto scratchpad_memory = make_onednn_memory( + matmul_pd.scratchpad_desc(), engine, scratchpad_tensor.data_ptr()); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad_memory}); + + args.insert({DNNL_ARG_SRC, m1_m}); + args.insert({DNNL_ARG_WEIGHTS, m2_m}); + args.insert({DNNL_ARG_DST, dst_m}); + if (with_bias) { + auto bias_m = make_onednn_memory(bias_md, engine, b.data_ptr()); + args.insert({DNNL_ARG_BIAS, bias_m}); + } + + sycl::event matmul_event = dnnl::sycl_interop::execute(matmul_p, stream, args, deps); + + if (!dst.is_same(result)) + result.copy_(dst); + + return matmul_event; +} + +} // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp new file mode 100644 index 0000000000000..8dd3dc329c70f --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.cpp @@ -0,0 +1,380 @@ +#include + +namespace at::native::onednn { + +dnnl::memory make_onednn_memory( + dnnl::memory::desc md, + dnnl::engine& engine, + void* ptr){ + return dnnl::sycl_interop::make_memory( + md, + engine, + dnnl::sycl_interop::memory_kind::usm, + ptr == nullptr ? 
DNNL_MEMORY_ALLOCATE : ptr); +} + +dnnl::memory::format_tag get_dnnl_default_format( + int ndims, + bool is_channels_last, + bool allow_undef) { + switch (ndims) { + case 1: + return dnnl::memory::format_tag::a; + case 2: + return dnnl::memory::format_tag::ab; + case 3: + return is_channels_last ? dnnl::memory::format_tag::acb + : dnnl::memory::format_tag::abc; + case 4: + return is_channels_last ? dnnl::memory::format_tag::acdb + : dnnl::memory::format_tag::abcd; + case 5: + return is_channels_last ? dnnl::memory::format_tag::acdeb + : dnnl::memory::format_tag::abcde; + case 6: + return dnnl::memory::format_tag::abcdef; + case 7: + return dnnl::memory::format_tag::abcdefg; + case 8: + return dnnl::memory::format_tag::abcdefgh; + case 9: + return dnnl::memory::format_tag::abcdefghi; + case 10: + return dnnl::memory::format_tag::abcdefghij; + case 11: + return dnnl::memory::format_tag::abcdefghijk; + case 12: + return dnnl::memory::format_tag::abcdefghijkl; + default: + if (!allow_undef) { + TORCH_CHECK(false, "oneDNN doesn't support tensor dimension > 12"); + } + return dnnl::memory::format_tag::undef; + } +} + +dnnl::memory::data_type get_onednn_dtype( + const at::Tensor& tensor, + bool allow_undef) { + switch (tensor.scalar_type()) { + case at::ScalarType::Byte: + return dnnl::memory::data_type::u8; + case at::ScalarType::Char: + return dnnl::memory::data_type::s8; + case at::ScalarType::QInt8: + return dnnl::memory::data_type::s8; + case at::ScalarType::QUInt8: + return dnnl::memory::data_type::u8; + case at::ScalarType::Int: + return dnnl::memory::data_type::s32; + case at::ScalarType::Half: + return dnnl::memory::data_type::f16; + case at::ScalarType::Float: + return dnnl::memory::data_type::f32; + case at::ScalarType::BFloat16: + return dnnl::memory::data_type::bf16; + default: + if (!allow_undef) { + TORCH_CHECK( + false, + c10::toString(tensor.scalar_type()), + " is not supported in oneDNN!"); + } + return dnnl::memory::data_type::undef; + }; +} + +dnnl::memory::data_type get_onednn_dtype_include_double( + const at::Tensor& tensor, + bool allow_undef) { + if (tensor.scalar_type() == at::ScalarType::Double) + return dnnl::memory::data_type::f64; + return get_onednn_dtype(tensor, allow_undef); +} + +bool is_supported_onednn_dtype(const at::Tensor& tensor) { + return get_onednn_dtype(tensor, /*allow_undef*/ true) == + dnnl::memory::data_type::undef + ? false + : true; +} + +dnnl::memory::dims get_onednn_dims(const at::Tensor& tensor) { + dnnl::memory::dims dims; + for (size_t i = 0; i < tensor.sizes().size(); i++) + dims.push_back(tensor.size(i)); + return dims; +} + +dnnl::memory::dims get_onednn_strides(const at::Tensor& tensor) { + dnnl::memory::dims strides; + for (size_t i = 0; i < tensor.strides().size(); i++) + strides.push_back(tensor.stride(i)); + return strides; +} + +dnnl::memory::desc get_onednn_md(const at::Tensor& tensor) { + return { + get_onednn_dims(tensor), + get_onednn_dtype(tensor), + get_onednn_strides(tensor)}; +} + +bool onednn_strides_check(const at::Tensor& src) { + auto adims = get_onednn_dims(src); + int ndims = (int)adims.size(); + auto dims = adims.data(); + auto data_type = static_cast( + get_onednn_dtype(src, /*allow_undef*/ true)); + auto strides_info = get_onednn_strides(src); + auto strides = strides_info.empty() ? 
nullptr : &strides_info[0]; + + dnnl_memory_desc_t md; + dnnl_memory_desc_create_with_strides(&md, ndims, dims, data_type, strides); + dnnl_format_kind_t md_fmt_kind; + int md_ndims; + int md_inner_nblks; + dnnl_dims_t* md_padded_dims = nullptr; + + dnnl_memory_desc_query(md, dnnl_query_inner_nblks_s32, &md_inner_nblks); + dnnl_memory_desc_query(md, dnnl_query_format_kind, &md_fmt_kind); + dnnl_memory_desc_query(md, dnnl_query_ndims_s32, &md_ndims); + dnnl_memory_desc_query(md, dnnl_query_padded_dims, &md_padded_dims); + if (strides == nullptr || md_ndims == 0 || + md_fmt_kind != dnnl_format_kind_t::dnnl_blocked) + return true; + + dnnl_dims_t blocks = {0}; + int perm[DNNL_MAX_NDIMS] = {0}; + for (int d = 0; d < md_ndims; ++d) { + // no strides check needed for empty tensor + if (md_padded_dims[d] == 0) + return true; + + // no strides verification for runtime dims + if (strides[d] == DNNL_RUNTIME_DIM_VAL) + return true; + + perm[d] = d; + blocks[d] = 1; + } + + auto block_size = 1; + dnnl_dims_t md_inner_blks; + dnnl_dims_t md_blk_inner_idxs; + dnnl_memory_desc_query(md, dnnl_query_inner_idxs, &md_blk_inner_idxs); + dnnl_memory_desc_query(md, dnnl_query_inner_blks, &md_inner_blks); + for (int iblk = 0; iblk < md_inner_nblks; ++iblk) { + blocks[md_blk_inner_idxs[iblk]] *= md_inner_blks[iblk]; + block_size *= md_inner_blks[iblk]; + } + + // A custom comparator to yield linear order on perm + auto idx_sorter = [&](const int a, const int b) -> bool { + if (strides[a] == strides[b] && md_padded_dims[a] == md_padded_dims[b]) + return a < b; + else if (strides[a] == strides[b]) + return md_padded_dims[a] < md_padded_dims[b]; + else + return strides[a] < strides[b]; + }; + std::sort(perm, perm + md_ndims, idx_sorter); + + auto min_stride = block_size; + for (int idx = 0; idx < md_ndims; ++idx) { + const int d = perm[idx]; + + // Make an exception for strides[d] == 0 as it has broadcast semantics + // Note: owing to being sorted, these are the initial strides + if (strides[d] == 0) + continue; + else if (strides[d] < min_stride) + return false; + + // update min_stride for next iteration + const auto padded_dim = *md_padded_dims[d]; + min_stride = block_size * strides[d] * (padded_dim / blocks[d]); + } + return true; +} + +bool is_broadcast(const at::Tensor& t) { + for (int i = 0; i < t.dim(); i++) { + if (t.stride(i) == 0) + return true; + } + return false; +} + +bool is_onednn_matmul_strides( + const at::Tensor& tensor, + bool is_dst) { + // https://oneapi-src.github.io/oneDNN/dev_guide_matmul.html + // oneDNN matmul only support 2-dim and 3-dim + // 2D src(Mxk), wei(KxN), dst(MxN) + // 3D src(SxMxK), wei(WxKxN), dst(DxMxN) + auto sizes = tensor.sizes(); + auto tensor_dim = sizes.size(); + if (tensor_dim != 2 && tensor_dim != 3) + return false; + + if (tensor.is_contiguous()) + return true; + + // the overlaped cases are not supported + dnnl::memory::dims strides = get_onednn_strides(tensor); + int64_t storage_size = 1; + for (size_t dim = 0; dim < tensor_dim; ++dim) + storage_size += (sizes[dim] - 1) * strides[dim]; + if (storage_size < tensor.numel()) + return false; + + // the broadcast cases are not supported + if (is_broadcast(tensor)) { + return false; + } + + if (is_dst) { + // The memory format of the destination tensor should always + // be plain with n axis contiguous + if (strides[-1] != 1) + return false; + } else { + // the src and weight must have at least one of the axes + // m or k and n or k contiguous (i.e., stride=1) respectively. 
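    // e.g. a row-major [M, K] matrix (strides {K, 1}) passes, as does the
    // [M, K] transpose of a row-major [K, M] buffer (strides {1, M}); a view
    // whose last two strides are both non-unit, such as {2K, 2} from slicing
    // every other column, is rejected here.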
+ if (strides[tensor_dim - 1] != 1 && strides[tensor_dim - 2] != 1) + return false; + } + + if (!onednn_strides_check(tensor)) + return false; + return true; +} + +bool is_broadcast_from_other_to_self( + const at::Tensor& self, + const at::Tensor& other) { + return ( + self.sizes() != other.sizes() && + at::is_expandable_to(other.sizes(), self.sizes())); +} + +at::MemoryFormat get_cl_tag_by_ndim(const int64_t ndim) { + TORCH_CHECK( + 3 == ndim || 4 == ndim || 5 == ndim, + "ndim must be 3, 4 or 5 when get cl tag"); + if (3 == ndim) { + return at::MemoryFormat::Contiguous; + } else if (5 == ndim) { + return at::MemoryFormat::ChannelsLast3d; + } else { + return at::MemoryFormat::ChannelsLast; + } +} + +bool binary_valid( + const at::Tensor& self, + const at::Tensor& other, + bool is_fusion) { + if (self.sizes() != other.sizes() && + !is_broadcast_from_other_to_self(self, other)) + return false; + + /* If the following conditions are satisfied, then oneDNN path will be + selected: + * 1. self and other should be xpu tensor and be defined. + * 2. self or other should not be scalar (wrapped tensor). + * 3. dim of self and other should be equal and must be larger than 0 and + smaller than 7. + * 4. the datatype should be supported by oneDNN primitive. + * 5. self and other should be in the same datatype. + * 6. self and other should be contiguous or channel-last contiguous.*/ + + + // 1. self and other should be xpu tensor and be defined. + if ((!self.defined()) || (!other.defined()) || (!self.is_xpu()) || + (!other.is_xpu())) + return false; + + // 2. self or other should not be scalar (wrapped tensor). + if (self.unsafeGetTensorImpl()->is_wrapped_number() || other.unsafeGetTensorImpl()->is_wrapped_number()) + return false; + + // 3. dim of self and other should be equal and must be larger than 0 and + // smaller than 7. + if ((self.dim() <= 0) || (other.dim() <= 0) || (self.dim() != other.dim()) || + (self.dim() > 6) || (other.dim() > 6)) + return false; + + // 4. the datatype should be supported by oneDNN primitive. + switch (self.scalar_type()) { + case at::ScalarType::Char: + break; + case at::ScalarType::Byte: + break; + case at::ScalarType::Half: + break; + case at::ScalarType::Float: + break; + case at::ScalarType::BFloat16: + break; + default: + return false; + }; + + // 5. datatype check + if (is_fusion) { + // for fusion case, the fusion can be performed on scalar_type or Float + // datatype. + if (self.scalar_type() != other.scalar_type() && + other.scalar_type() != at::ScalarType::Float) { + return false; + } + } else { + if (self.scalar_type() != other.scalar_type()) { + // for non-fusion case: self and other should be in the same datatype. + return false; + } + } + + // 6. self and other should be contiguous or channel-last contiguous. 
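  // e.g. for 4-D inputs both tensors must be NCHW-contiguous or both must be
  // NHWC (channels-last) contiguous; mixed layouts return false so the caller
  // does not select the oneDNN path.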
+ const auto ndim = self.ndimension(); + auto cl_tag = at::MemoryFormat::ChannelsLast; + if (3 == ndim || 4 == ndim || 5 == ndim) { + cl_tag = get_cl_tag_by_ndim(ndim); + } + if ((self.is_contiguous() && other.is_contiguous()) || + (self.is_contiguous(cl_tag) && other.is_contiguous(cl_tag))) + return true; + return false; +} + +static inline bool is_channels_last(at::MemoryFormat fmt){ + return (at::MemoryFormat::ChannelsLast == fmt) || (at::MemoryFormat::ChannelsLast3d == fmt); +} + +static inline bool is_smf_channels_last(const Tensor& t){ + return is_channels_last(t.suggest_memory_format()); +} + +bool use_channels_last_for_conv( + const at::Tensor& src, + const at::Tensor& weight, + bool is_transpose){ + + if (!src.defined() || src.is_sparse()) { + // suggest channels_first + return false; + } + + auto suggest_channels_last_format = + (is_smf_channels_last(src) || is_smf_channels_last(weight)); + if (suggest_channels_last_format) { + // suggest channels_last + return true; + } + + return false; +} + +} diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h new file mode 100644 index 0000000000000..2929d3159e139 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/detail/Utils.h @@ -0,0 +1,61 @@ +#pragma once +#include +#include +#include +#include + +#include +#include +#include +#include +#include + + +#define ONEDNN_SUPPORT_DETERMINISTIC (DNNL_VERSION_MAJOR >=3 && DNNL_VERSION_MINOR >=4) + +namespace at::native::onednn { + +dnnl::memory::format_tag get_dnnl_default_format( + int ndims, + bool is_channels_last = false, + bool allow_undef = false); + +dnnl::memory::data_type get_onednn_dtype( + const at::Tensor& tensor, + bool allow_undef = false); + +dnnl::memory::data_type get_onednn_dtype_include_double( + const at::Tensor& tensor, + bool allow_undef = false); + +bool is_supported_onednn_dtype(const at::Tensor& tensor); + +dnnl::memory::dims get_onednn_dims(const at::Tensor& tensor); + +dnnl::memory::dims get_onednn_strides(const at::Tensor& tensor); +dnnl::memory::desc get_onednn_md(const at::Tensor& tensor); + +bool onednn_strides_check(const at::Tensor& src); +bool is_broadcast(const at::Tensor& t); + +bool is_onednn_matmul_strides( + const at::Tensor& tensor, + bool is_dst = false); + +bool is_broadcast_from_other_to_self( + const at::Tensor& self, + const at::Tensor& other); + +at::MemoryFormat get_cl_tag_by_ndim(const int64_t ndim); + +bool binary_valid( + const at::Tensor& self, + const at::Tensor& other, + bool is_fusion = false); + +bool use_channels_last_for_conv( + const at::Tensor& src, + const at::Tensor& weight, + bool is_transpose); + +} // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h new file mode 100644 index 0000000000000..0c219fc8c6db6 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNN.h @@ -0,0 +1,110 @@ +#pragma once + +#include +#include +#include +#include + +namespace at::native::onednn{ + +TORCH_API sycl::event matmul( + at::Tensor& result, + const at::Tensor& mat1, + const at::Tensor& mat2, + const at::Tensor& b_raw, + bool m2_trans, + Attr attr, + const std::vector& deps = {}); + +TORCH_API sycl::event convolution( + at::Tensor& dst, + const at::Tensor& src, + const at::Tensor& weight, + const at::Tensor& bia, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + Attr& attr, + const std::vector& deps = {}); 
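// Example: a minimal sketch of how a caller can drive the forward convolution
// declared above. The tensor shapes, the fused ReLU, and the preallocated dst
// are illustrative assumptions; the real call sites are the XPU kernels
// registered via convolution_overrideable earlier in this patch.
//
//   using namespace at::native::onednn;
//
//   // fp32 NCHW tensors on the current XPU device
//   at::Tensor src = at::randn({8, 3, 32, 32}, at::kXPU);
//   at::Tensor wgh = at::randn({16, 3, 3, 3}, at::kXPU);
//   at::Tensor bia = at::randn({16}, at::kXPU);
//
//   std::vector<int64_t> stride = {1, 1}, padding = {1, 1}, dilation = {1, 1};
//   auto dst_dims = conv_dst_size(
//       src.ndimension(), src.sizes(), wgh.sizes(),
//       padding, padding, stride, dilation);
//   at::Tensor dst = at::empty(dst_dims, src.options());
//
//   Attr attr;                                        // post-op list starts empty
//   attr.append_post_eltwise(
//       1.f, 0.f, 0.f, attr.kind_with_relu);          // optionally fuse a ReLU
//
//   sycl::event done = convolution(
//       dst, src, wgh, bia,
//       /*padding_front_top_left=*/padding,
//       /*padding_back_bottom_right=*/padding,
//       stride, dilation, /*groups=*/1, attr);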
+ +TORCH_API sycl::event convolution_backward_weights( + at::Tensor& diff_weight, + at::Tensor& diff_bia, + const at::Tensor& diff_dst, + const at::Tensor& src, + IntArrayRef diff_weight_aten_size, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + const std::vector& deps = {}); + +TORCH_API sycl::event convolution_backward_data( + at::Tensor& diff_src, + const at::Tensor& diff_dst, + const at::Tensor& weight, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation, + int64_t groups, + bool bias_defined, + const std::vector& deps = {}); + +TORCH_API sycl::event deconvolution( + at::Tensor& dst, + const at::Tensor& src, + const at::Tensor& weight, + const at::Tensor& bia, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dst_padding, + IntArrayRef dilation, + int64_t groups, + Attr& attr, + const std::vector& deps = {}); + +TORCH_API sycl::event deconvolution_backward_data( + at::Tensor& diff_src, + const at::Tensor& diff_dst, + const at::Tensor& weight, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + bool bias_defined, + const std::vector& deps = {}); + +TORCH_API sycl::event deconvolution_backward_weights( + at::Tensor& diff_weight, + at::Tensor& diff_bia, + const at::Tensor& diff_dst, + const at::Tensor& src, + IntArrayRef stride, + IntArrayRef padding, + IntArrayRef dilation, + int64_t groups, + const std::vector& deps = {}); + +dnnl::memory::dims conv_dst_size( + int64_t ndim, + IntArrayRef src_tz, + IntArrayRef wgh_tz, + IntArrayRef padding_front_top_left, + IntArrayRef padding_back_bottom_right, + IntArrayRef stride, + IntArrayRef dilation); + +dnnl::memory::dims deconv_dst_size( + IntArrayRef src_size, + IntArrayRef wgh_size, + IntArrayRef padding, + IntArrayRef stride, + IntArrayRef dilation, + IntArrayRef dst_padding, + int64_t groups); + +} // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.cpp b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.cpp new file mode 100644 index 0000000000000..9bec64c8c0248 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.cpp @@ -0,0 +1,27 @@ +#include +#include + +/* * + * Do NOT put any kernels or call any device binaries here! + * Only maintain oneDNN runtime states in this file. 
+ * */ +namespace at::native::onednn { + +using namespace dnnl; + +GpuEngineManager& GpuEngineManager::Instance() { + static GpuEngineManager myInstance; + return myInstance; +} + +GpuStreamManager& GpuStreamManager::Instance() { + static thread_local GpuStreamManager myInstance; + return myInstance; +} + +bool set_onednn_verbose(int level) { + dnnl::status rs = dnnl::set_verbose(level); + return rs == dnnl::status::success; +} + +} // namespace at::native::onednn diff --git a/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.h b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.h new file mode 100644 index 0000000000000..c7e7a5e94b406 --- /dev/null +++ b/aten/src/ATen/native/mkldnn/xpu/detail/oneDNNContext.h @@ -0,0 +1,75 @@ +#pragma once + +#include + +#include +#include +#include + +#include +#include +#include + +namespace at::native::onednn { + +TORCH_API dnnl::memory make_onednn_memory( + dnnl::memory::desc md, + dnnl::engine& engine, + void* ptr); + +// Keep non-static and non-inline +bool set_onednn_verbose(int level); + +// GpuEngineManager singleton +struct TORCH_API GpuEngineManager { + static GpuEngineManager& Instance(); // Singleton + + dnnl::engine& get_engine(const Device& device) { + TORCH_INTERNAL_ASSERT(device.type() == kXPU); + TORCH_INTERNAL_ASSERT(device.index() < c10::xpu::device_count()); + return *engine_pool[device.index()]; + } + + GpuEngineManager(GpuEngineManager const&) = delete; + GpuEngineManager& operator=(GpuEngineManager const&) = delete; + + protected: + GpuEngineManager() { + int device_count = (int)c10::xpu::device_count(); + TORCH_INTERNAL_ASSERT(device_count > 0); + for (int i = 0; i < device_count; i++) { + engine_pool.push_back( + std::make_shared(dnnl::sycl_interop::make_engine( + c10::xpu::get_raw_device(i), c10::xpu::get_device_context() + ))); + } + } + ~GpuEngineManager() {} + + private: + std::vector> engine_pool; +}; + +// GpuStreamManager singleton +struct TORCH_API GpuStreamManager { + static GpuStreamManager& Instance(); // Singleton + + dnnl::stream get_stream() { + c10::DeviceIndex device_index = c10::xpu::current_device(); + TORCH_INTERNAL_ASSERT(device_index < c10::xpu::device_count()); + return dnnl::sycl_interop::make_stream( + GpuEngineManager::Instance().get_engine({c10::kXPU, device_index}), + c10::xpu::getCurrentXPUStream(device_index).queue()); + } + + GpuStreamManager(GpuStreamManager const&) = delete; + GpuStreamManager& operator=(GpuStreamManager const&) = delete; + + protected: + GpuStreamManager() { + } + ~GpuStreamManager() {} + +}; + +} // namespace at::native::onednn diff --git a/aten/src/ATen/native/mps/MPSGraphSonomaOps.h b/aten/src/ATen/native/mps/MPSGraphSonomaOps.h new file mode 100644 index 0000000000000..b4cf3ad5dbcc8 --- /dev/null +++ b/aten/src/ATen/native/mps/MPSGraphSonomaOps.h @@ -0,0 +1,53 @@ +#pragma once + +#include + +#if !defined(__MAC_14_0) && \ + (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0)) + +typedef NS_ENUM(NSUInteger, MPSGraphFFTScalingMode) +{ + MPSGraphFFTScalingModeNone = 0L, + MPSGraphFFTScalingModeSize = 1L, + MPSGraphFFTScalingModeUnitary = 2L, +}; + +@interface FakeMPSGraphFFTDescriptor : NSObject +@property (readwrite, nonatomic) BOOL inverse; +@property (readwrite, nonatomic) MPSGraphFFTScalingMode scalingMode; +@property (readwrite, nonatomic) BOOL roundToOddHermitean; ++(nullable instancetype) descriptor; +@end + +@compatibility_alias MPSGraphFFTDescriptor FakeMPSGraphFFTDescriptor; + +@interface MPSGraph (SonomaOps) +-(MPSGraphTensor 
* _Nonnull) conjugateWithTensor:(MPSGraphTensor * _Nonnull) tensor + name:(NSString * _Nullable) name; + +-(MPSGraphTensor * _Nonnull) realPartOfTensor:(MPSGraphTensor * _Nonnull) tensor + name:(NSString * _Nullable) name; + + +-(MPSGraphTensor * _Nonnull) fastFourierTransformWithTensor:(MPSGraphTensor * _Nonnull) tensor + axes:(NSArray * _Nonnull) axes + descriptor:(MPSGraphFFTDescriptor * _Nonnull) descriptor + name:(NSString * _Nullable) name; + +-(MPSGraphTensor * _Nonnull) realToHermiteanFFTWithTensor:(MPSGraphTensor * _Nonnull) tensor + axes:(NSArray * _Nonnull) axes + descriptor:(MPSGraphFFTDescriptor * _Nonnull) descriptor + name:(NSString * _Nullable) name; + +-(MPSGraphTensor * _Nonnull) HermiteanToRealFFTWithTensor:(MPSGraphTensor * _Nonnull) tensor + axes:(NSArray * _Nonnull) axes + descriptor:(MPSGraphFFTDescriptor * _Nonnull) descriptor + name:(NSString * _Nullable) name; +@end + +// define BFloat16 enums for MacOS13 +#define MPSDataTypeBFloat16 ((MPSDataType) (MPSDataTypeAlternateEncodingBit | MPSDataTypeFloat16)) + +// define Metal version +#define MTLLanguageVersion3_1 ((MTLLanguageVersion) ((3 << 16) + 1)) +#endif diff --git a/aten/src/ATen/native/mps/OperationUtils.h b/aten/src/ATen/native/mps/OperationUtils.h index d47d77b819e50..3e812d0718dcc 100644 --- a/aten/src/ATen/native/mps/OperationUtils.h +++ b/aten/src/ATen/native/mps/OperationUtils.h @@ -2,6 +2,7 @@ #pragma once +#include #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include @@ -46,11 +47,13 @@ struct MPSScalar { at::Half h; int64_t i; bool b; + c10::complex cf; + c10::complex ch; + at::BFloat16 bf16; } value {}; }; -void runMPSGraph( - MPSStream* mpsStream, +void runMPSGraph(MPSStream* mpsStream, MPSGraph* mpsGraph, NSDictionary* feeds, NSDictionary* results); @@ -69,10 +72,13 @@ static inline std::string getMPSTypeString(const Tensor& t, bool short_name = fa return getMPSTypeString(t.scalar_type(), short_name); } std::string scalarToMetalTypeString(const c10::ScalarType& scalar_type); +static inline std::string scalarToMetalTypeString(const Tensor& t) { + return scalarToMetalTypeString(t.scalar_type()); +} NSArray* getTensorAxes(const Tensor& t); NSArray* getTensorAxes(const IntArrayRef& sizes, at::OptionalIntArrayRef dim); std::string getMPSShapeString(MPSShape* shape); -std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype = true); +std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype = true, bool exclude_shape = false); std::string getArrayRefString(const IntArrayRef s); // use has_storage() on the returned tensor to determine if src actually is a view Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst); @@ -327,6 +333,30 @@ inline bool is_dense_in_storage(const at::Tensor& t) { return compute_storage_numel_distance(t) == static_cast(t.numel()); } + +class MetalShaderLibrary { +public: + MetalShaderLibrary(const std::string& src, unsigned nparams_ = 0): shaderSource(src), nparams(nparams_) {} + MetalShaderLibrary(const MetalShaderLibrary&) = delete; + inline id getPipelineStateForFunc(const std::string& fname) { + return getLibraryPipelineState(getLibrary(), fname); + } + id getPipelineStateForFunc(const std::string& fname, const std::initializer_list& params) { + return getLibraryPipelineState(getLibrary(params), fname); + } +private: + id getLibraryPipelineState(id lib, const std::string& fname); + id getLibrary(); + id getLibrary(const std::initializer_list& params); + + id compileLibrary(const std::string& src); + std::string 
shaderSource; + unsigned nparams; + id library = nil; + std::unordered_map> libMap; + std::unordered_map> cplMap; +}; + static inline void mtl_setBuffer(id encoder, const Tensor& t, unsigned idx) { [encoder setBuffer:getMTLBufferStorage(t) offset:t.storage_offset() * t.element_size() @@ -344,4 +374,53 @@ static inline void mtl_dispatch1DJob(id encoder, id generateKernelDataOffsets(id commandEncoder, const TensorIteratorBase& iter, bool use_64bit_index = false); +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1) { + return @{ p1.getMPSGraphTensor(): p1.getMPSGraphTensorData() }; +} + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2) { + return @{ + p1.getMPSGraphTensor(): p1.getMPSGraphTensorData(), + p2.getMPSGraphTensor(): p2.getMPSGraphTensorData(), + }; +} + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2, Placeholder& p3) { + return @{ + p1.getMPSGraphTensor(): p1.getMPSGraphTensorData(), + p2.getMPSGraphTensor(): p2.getMPSGraphTensorData(), + p3.getMPSGraphTensor(): p3.getMPSGraphTensorData(), + }; +} + +inline NSDictionary* dictionaryFromPlaceholders(Placeholder& p1, Placeholder& p2, Placeholder& p3, Placeholder& p4) { + return @{ + p1.getMPSGraphTensor(): p1.getMPSGraphTensorData(), + p2.getMPSGraphTensor(): p2.getMPSGraphTensorData(), + p3.getMPSGraphTensor(): p3.getMPSGraphTensorData(), + p4.getMPSGraphTensor(): p4.getMPSGraphTensorData(), + }; +} + +inline void runMPSGraph(MPSStream* stream, MPSGraph* graph, NSDictionary* feeds, Placeholder& result) { + runMPSGraph(stream, graph, feeds, dictionaryFromPlaceholders(result)); +} + +inline bool supportsComplex() { + return is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS); +} + +// MPS yet to support double types, but starting from MacOS 14, supports bfloat16 +inline bool supportedFloatingType(ScalarType dtype) { + return dtype == kFloat || dtype == kHalf || dtype == kBFloat16; +} + +inline bool supportedFloatingType(const Tensor& t) { + return supportedFloatingType(t.scalar_type()); +} + +inline bool needsGather(const Tensor& t) { + return !t.is_contiguous() || t.storage_offset(); +} + } // namespace at::native::mps diff --git a/aten/src/ATen/native/mps/OperationUtils.mm b/aten/src/ATen/native/mps/OperationUtils.mm index ef651e784e0fd..8170bd0047397 100644 --- a/aten/src/ATen/native/mps/OperationUtils.mm +++ b/aten/src/ATen/native/mps/OperationUtils.mm @@ -3,8 +3,10 @@ #include #include #include +#include #include #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -48,12 +50,24 @@ void runMPSGraph(MPSStream* mpsStream, MPSGraph* mpsGraph, NSDictionary* feeds, mpsStream->executeMPSGraph(mpsGraph, feeds, results, SyncType::COMMIT_ADAPTIVE); } +static inline void checkSupportsComplex() { + TORCH_CHECK_TYPE(supportsComplex(), "MPS complex types are only supported on MacOS 14.0 or newer."); +} + +static inline void checkSupportsBFloat16() { + TORCH_CHECK_TYPE(is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS), + "MPS bfloat16 type is supported on MacOS 14.0 or newer."); +} + MPSDataType getMPSDataType(ScalarType scalar_type) { switch (scalar_type) { case ScalarType::Float: return MPSDataTypeFloat32; case ScalarType::Half: return MPSDataTypeFloat16; + case ScalarType::BFloat16: + checkSupportsBFloat16(); + return MPSDataTypeBFloat16; case ScalarType::Int: return MPSDataTypeInt32; case ScalarType::Long: @@ -71,12 +85,10 @@ MPSDataType getMPSDataType(ScalarType scalar_type) { "Cannot convert a float64 Tensor to MPS as the MPS framework 
doesn't support float64. " "Please use float32 instead.") case ScalarType::ComplexHalf: - TORCH_CHECK_TYPE(is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS), - "MPS complex types are only supported on MacOS 14.0 or newer."); + checkSupportsComplex(); return MPSDataTypeComplexFloat16; case ScalarType::ComplexFloat: - TORCH_CHECK_TYPE(is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS), - "MPS complex types are only supported on MacOS 14.0 or newer."); + checkSupportsComplex(); return MPSDataTypeComplexFloat32; default: TORCH_CHECK_TYPE( @@ -132,6 +144,9 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { return MPSDataTypeFloat32; case ScalarType::Half: return MPSDataTypeFloat16; + case ScalarType::BFloat16: + checkSupportsBFloat16(); + return MPSDataTypeBFloat16; case ScalarType::Int: return MPSDataTypeInt32; case ScalarType::Long: @@ -145,12 +160,13 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { case ScalarType::Bool: return MPSDataTypeBool; case ScalarType::ComplexHalf: - TORCH_CHECK_TYPE(is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS), - "MPS complex types are only supported on MacOS 14.0 or newer."); + checkSupportsComplex(); return MPSDataTypeComplexFloat16; + // This is an intentional fallthrough supporting ComplexDouble for Scalar + // types as they are casted to Complex64 currently. + case ScalarType::ComplexDouble: case ScalarType::ComplexFloat: - TORCH_CHECK_TYPE(is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS), - "MPS complex types are only supported on MacOS 14.0 or newer."); + checkSupportsComplex(); return MPSDataTypeComplexFloat32; default: TORCH_CHECK_TYPE( @@ -166,6 +182,8 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { return short_name ? "f32" : "Float32"; case ScalarType::Half: return short_name ? "f16" : "Float16"; + case ScalarType::BFloat16: + return short_name ? "bf16" : "BFloat16"; case ScalarType::Int: return short_name ? 
"i32" : "Int32"; case ScalarType::Long: @@ -193,6 +211,9 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { return "float"; case ScalarType::Half: return "half"; + case ScalarType::BFloat16: + checkSupportsBFloat16(); + return "bfloat"; case ScalarType::Int: return "int"; case ScalarType::Long: @@ -256,7 +277,7 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { return ss.str(); } -std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype) { +std::string getTensorsStringKey(const TensorList& tensors, bool short_dtype, bool exclude_shape) { std::string str; // The key format per tensor would look like ":Float32[1,1,1,10]:" for (const Tensor& tensor : tensors) { @@ -267,8 +288,12 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) { if (tensor.dim() == 0) { str += "Scalar"; } else { - const NSString* ns_shape_key = [[getMPSShape(tensor) valueForKey:@"description"] componentsJoinedByString:@","]; - str += std::string(ns_shape_key.UTF8String); + if (exclude_shape) { + str += "[-1]"; + } else { + str += + std::string([[getMPSShape(tensor) valueForKey:@"description"] componentsJoinedByString:@","].UTF8String); + } } str += "]"; } else { @@ -343,9 +368,8 @@ void printTensorNDArray(const Tensor& t) { TORCH_CHECK(src.is_mps(), "Placeholder storage has not been allocated on MPS device!"); // extract the pointer to MTLBuffer from the Tensor's storage id srcBuf = getMTLBufferStorage(src); - bool sliceViewTensor = canSliceViewTensor(src, mpsShape); // a view tensor could be contiguous (e.g., slice ops) or non-contiguous (e.g., transpose()) - if ((!src.is_contiguous() || (src.storage_offset() && !sliceViewTensor)) && gatherTensorData) { + if (needsGather(src) && gatherTensorData) { Tensor emptyShell = Tensor(); // use "_tensor" from Placeholder to retain view's output during its usage in other ops _tensor = gatherViewTensor(src, emptyShell); @@ -361,19 +385,13 @@ void printTensorNDArray(const Tensor& t) { // if buffer size is zero in here, it's not a user error. It could be a missing check for // tensor.numel() == 0 in our internal implementations of ops. TORCH_INTERNAL_ASSERT([srcBuf length] > 0, "Placeholder tensor is empty!"); - const MPSDataType mpsDataType = dataType != MPSDataTypeInvalid ? dataType - : _tensor.dim() == 0 ? getMPSScalarType(_tensor.scalar_type()) - : getMPSDataType(_tensor.scalar_type()); - - if (src.is_contiguous() && src.storage_offset() && sliceViewTensor) { - _value = getMPSGraphTensorDataForView(src, mpsShape, mpsDataType); - } else { - if (!mpsShape) { - mpsShape = getMPSShape(_tensor); - } - - _value = [[[MPSGraphTensorData alloc] initWithMTLBuffer:srcBuf shape:mpsShape dataType:mpsDataType] autorelease]; + if (dataType == MPSDataTypeInvalid) { + const auto scalar_type = _tensor.scalar_type(); + dataType = _tensor.dim() == 0 ? getMPSScalarType(scalar_type) : getMPSDataType(scalar_type); } + _value = [[[MPSGraphTensorData alloc] initWithMTLBuffer:srcBuf + shape:mpsShape ? 
mpsShape : getMPSShape(_tensor) + dataType:dataType] autorelease]; TORCH_INTERNAL_ASSERT(_value); _placeholder = mpsGraphTensor; @@ -393,7 +411,7 @@ void printTensorNDArray(const Tensor& t) { MPSNDArray* emptyArray = [[[MPSNDArray alloc] initWithDevice:mpsStream->device() descriptor:desc] autorelease]; result = [[[MPSGraphTensorData alloc] initWithMPSNDArray:emptyArray] autorelease]; } - assert(result); + TORCH_INTERNAL_ASSERT(result); return result; } @@ -404,6 +422,8 @@ MPSScalar getMPSScalar(const Scalar& scalar, ScalarType type) { return {.value.f = scalar.to(), .size = sizeof(float), .type = type}; case ScalarType::Half: return {.value.h = scalar.to(), .size = sizeof(short), .type = type}; + case ScalarType::BFloat16: + return {.value.bf16 = scalar.to(), .size = sizeof(short), .type = type}; case ScalarType::Long: return {.value.i = scalar.to(), .size = sizeof(int64_t), .type = type}; case ScalarType::Int: @@ -416,6 +436,11 @@ MPSScalar getMPSScalar(const Scalar& scalar, ScalarType type) { return {.value.i = scalar.to(), .size = sizeof(uint8_t), .type = type}; case ScalarType::Bool: return {.value.b = scalar.to(), .size = sizeof(bool), .type = type}; + case ScalarType::ComplexHalf: + return {.value.ch = scalar.to>(), .size = sizeof(int32_t), .type = type}; + case ScalarType::ComplexFloat: + case ScalarType::ComplexDouble: + return {.value.cf = scalar.to>(), .size = sizeof(int64_t), .type = type}; default: TORCH_INTERNAL_ASSERT(false, "Unsupported scalar type '", type, "' on MPS backend."); } @@ -455,7 +480,7 @@ Tensor wrapped_scalar_tensor_mps(const Scalar& scalar, const Device device) { } else if (scalar.isComplex()) { tensor = at::scalar_tensor(scalar, at::device(device).dtype(at::kComplexDouble)); } else { - AT_ASSERT(scalar.isIntegral(false)); + TORCH_INTERNAL_ASSERT(scalar.isIntegral(false)); tensor = at::scalar_tensor(scalar, at::device(device).dtype(at::kLong)); } tensor.unsafeGetTensorImpl()->set_wrapped_number(true); @@ -518,7 +543,7 @@ string get_mem_format_string(c10::MemoryFormat memory_format) { mem_format_key = "ChannelsLast"; break; default: - assert(0 && "Invalid memory format\n"); + TORCH_CHECK(false, "Invalid memory format", memory_format); } return mem_format_key; @@ -587,4 +612,74 @@ void executeMPSAllocatorCallback(void* ptr, EventType event) override {} return kernelDataOffsets; } +id MetalShaderLibrary::getLibrary() { + if (C10_UNLIKELY(!library)) { + TORCH_INTERNAL_ASSERT(nparams == 0); + library = compileLibrary(shaderSource); + } + return library; +} + +id MetalShaderLibrary::getLibrary(const std::initializer_list& params) { + TORCH_INTERNAL_ASSERT(nparams == params.size()); + std::string key = ""; + for (auto p : params) { + key += ":" + p; + } + auto lib = libMap[key]; + if (lib) { + return lib; + } + auto it = params.begin(); + switch (nparams) { + case 1: + lib = compileLibrary(fmt::format(shaderSource, *it)); + break; + case 2: { + auto& first = *it++; + auto& second = *it; + lib = compileLibrary(fmt::format(shaderSource, first, second)); + break; + } + case 3: { + auto& first = *it++; + auto& second = *it++; + auto& third = *it; + lib = compileLibrary(fmt::format(shaderSource, first, second, third)); + break; + } + default: + TORCH_INTERNAL_ASSERT(false, "Unsupported number of paramaters ", nparams); + } + return libMap[key] = lib; +} + +id MetalShaderLibrary::compileLibrary(const std::string& src) { + NSError* error = nil; + MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; + [options 
setLanguageVersion:is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS) ? MTLLanguageVersion3_1 + : MTLLanguageVersion2_3]; + auto str = [NSString stringWithCString:src.c_str() encoding:NSASCIIStringEncoding]; + auto device = MPSDevice::getInstance()->device(); + library = [device newLibraryWithSource:str options:options error:&error]; + TORCH_CHECK(library, "Failed to create metal library, error: ", [[error description] UTF8String]); + return library; +} + +id MetalShaderLibrary::getLibraryPipelineState(id lib, const std::string& fname) { + auto key = fmt::format("{}:{}", reinterpret_cast(lib), fname); + auto cpl = cplMap[key]; + if (cpl) { + return cpl; + } + + NSError* error = nil; + id func = [lib newFunctionWithName:[NSString stringWithUTF8String:fname.c_str()]]; + TORCH_CHECK(func, "Failed to create function state object for: ", fname); + cpl = [[lib device] newComputePipelineStateWithFunction:func error:&error]; + TORCH_CHECK(cpl, "Failed to created pipeline state object, error: ", [[error description] UTF8String]); + + return cplMap[key] = cpl; +} + } // namespace at::native::mps diff --git a/aten/src/ATen/native/mps/operations/Activation.mm b/aten/src/ATen/native/mps/operations/Activation.mm index a8ac52c2ec25e..da11401c948d3 100644 --- a/aten/src/ATen/native/mps/operations/Activation.mm +++ b/aten/src/ATen/native/mps/operations/Activation.mm @@ -41,10 +41,6 @@ #include #endif -#ifdef __OBJC__ -#include -#endif - using namespace at::mps; namespace at::native { @@ -53,6 +49,10 @@ Tensor relu_mps(const Tensor& self) { using namespace mps; using CachedGraph = MPSUnaryCachedGraph; + if (self.numel() == 0) { + return self; + } + MPSStream* stream = getCurrentMPSStream(); bool executeGatherOp = @@ -75,13 +75,8 @@ Tensor relu_mps(const Tensor& self) { Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output, nil, false); // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return output; @@ -90,6 +85,10 @@ Tensor relu_mps(const Tensor& self) { Tensor& relu_mps_(Tensor& self) { using namespace mps; using CachedGraph = MPSUnaryCachedGraph; + + if (self.numel() == 0) { + return self; + } // Inplace relu Tensor& output = self; bool executeGatherOp = @@ -117,13 +116,8 @@ Tensor relu_mps(const Tensor& self) { Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, executeGatherOp ? 
out : output, nil, false); // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); if (executeGatherOp) { output.copy_(out); } @@ -137,8 +131,17 @@ Tensor relu_mps(const Tensor& self) { using CachedGraph = MPSUnaryCachedGraph; TORCH_CHECK(output.is_mps()); + if (self.numel() == 0) { + return; + } + MPSStream* stream = getCurrentMPSStream(); + bool executeGatherOp = + !(self.is_contiguous(MemoryFormat::Contiguous) || self.is_contiguous(MemoryFormat::ChannelsLast) || + self.is_contiguous(MemoryFormat::ChannelsLast3d)); + Tensor output_ = at::empty_like(self, executeGatherOp ? MemoryFormat::Contiguous : MemoryFormat::Preserve); + @autoreleasepool { string key = "leaky_relu" + getTensorsStringKey({self}) + ":" + to_string(negative_slope.to()); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { @@ -158,17 +161,16 @@ Tensor relu_mps(const Tensor& self) { newCachedGraph->outputTensor_ = outputTensor; }); - Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); - Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp); + Placeholder outputPlaceholder = + Placeholder(cachedGraph->outputTensor_, executeGatherOp ? output_ : output, nil, false); // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); + } + if (executeGatherOp) { + output.copy_(output_); } } @@ -182,8 +184,14 @@ Tensor relu_mps(const Tensor& self) { using CachedGraph = MPSUnaryGradCachedGraph; TORCH_CHECK(output.is_mps()); + if (self.numel() == 0) { + return; + } + MPSStream* stream = getCurrentMPSStream(); + Tensor output_ = at::empty_like(self, self.suggest_memory_format()); + @autoreleasepool { string key = "leaky_relu_backward" + getTensorsStringKey({self, grad_output}) + ":" + to_string(negative_slope.to()); @@ -213,19 +221,13 @@ Tensor relu_mps(const Tensor& self) { Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); - Placeholder outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, output); + Placeholder outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, output_); // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = 
dictionaryFromPlaceholders(gradOutputPlaceholder, selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } + output.copy_(output_); } TORCH_IMPL_FUNC(log_softmax_mps_out) @@ -266,13 +268,8 @@ Tensor relu_mps(const Tensor& self) { Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -312,15 +309,8 @@ Tensor relu_mps(const Tensor& self) { Placeholder resultPlaceholder = Placeholder(cachedGraph->gradInputTensor_, out); // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradPlaceholder.getMPSGraphTensor() : gradPlaceholder.getMPSGraphTensorData(), - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{resultPlaceholder.getMPSGraphTensor() : resultPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradPlaceholder, outputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, resultPlaceholder); } } @@ -363,13 +353,8 @@ Tensor relu_mps(const Tensor& self) { Placeholder(cachedGraph->outputTensor_, executeGatherOp ? output_ : output, nil, false); // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } if (executeGatherOp) { @@ -447,15 +432,8 @@ Tensor relu_mps(const Tensor& self) { Placeholder(cachedGraph->gradInputTensor_, executeGatherOp ? 
grad_input_ : grad_input, nil, false); // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } if (executeGatherOp) { @@ -507,16 +485,8 @@ Tensor log_sigmoid_backward_mps(const Tensor& grad_output, const Tensor& self, c Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, outputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); } } @@ -555,16 +525,8 @@ Tensor log_sigmoid_backward_mps(const Tensor& grad_output, const Tensor& self, c Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, outputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); } } @@ -609,14 +571,8 @@ Tensor log_sigmoid_backward_mps(const Tensor& grad_output, const Tensor& self, c Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); - // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -663,16 +619,8 @@ Tensor log_sigmoid_backward_mps(const Tensor& grad_output, const Tensor& self, c Placeholder gradPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad); Placeholder outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, gradInput); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradPlaceholder.getMPSGraphTensor() : gradPlaceholder.getMPSGraphTensorData(), - 
selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradPlaceholder, selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -728,6 +676,11 @@ Tensor log_sigmoid_backward_mps(const Tensor& grad_output, const Tensor& self, c auto approximate_type = get_gelutype_enum(approximate); MPSStream* stream = getCurrentMPSStream(); + bool executeGatherOp = + !(self.is_contiguous(MemoryFormat::Contiguous) || self.is_contiguous(MemoryFormat::ChannelsLast) || + self.is_contiguous(MemoryFormat::ChannelsLast3d)); + Tensor output_ = at::empty_like(self, executeGatherOp ? MemoryFormat::Contiguous : MemoryFormat::Preserve); + @autoreleasepool { const auto key = "gelu_out_mps" + getTensorsStringKey({self}) + ":" + gelutype_to_string(approximate_type); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { @@ -744,16 +697,16 @@ Tensor log_sigmoid_backward_mps(const Tensor& grad_output, const Tensor& self, c newCachedGraph->outputTensor_ = outputTensor; }); - Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); - Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp); + Placeholder outputPlaceholder = + Placeholder(cachedGraph->outputTensor_, executeGatherOp ? output_ : output, nil, false); - // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); + } - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + if (executeGatherOp) { + output.copy_(output_); } } @@ -763,8 +716,11 @@ Tensor log_sigmoid_backward_mps(const Tensor& grad_output, const Tensor& self, c using CachedGraph = MPSUnaryGradCachedGraph; // Empty output - if (grad_input.numel() == 0) + if (self.numel() == 0) { return; + } + + Tensor grad_input_ = at::empty_like(self, self.suggest_memory_format()); auto approximate_type = get_gelutype_enum(approximate); MPSStream* stream = getCurrentMPSStream(); @@ -838,18 +794,12 @@ Tensor log_sigmoid_backward_mps(const Tensor& grad_output, const Tensor& self, c Placeholder gradPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad); Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); - Placeholder outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); + Placeholder outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input_); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradPlaceholder.getMPSGraphTensor() : gradPlaceholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradPlaceholder, selfPlaceholder); + runMPSGraph(stream, 
cachedGraph->graph(), feeds, outputPlaceholder); } + grad_input.copy_(grad_input_); } static void elu_variants_out_mps(const Tensor& self, @@ -864,7 +814,7 @@ static void elu_variants_out_mps(const Tensor& self, auto resultMemFormat = result.suggest_memory_format(); bool executeGatherOp = !(self.is_contiguous(resultMemFormat) && result.is_contiguous(resultMemFormat)); Tensor out; - if (executeGatherOp && resultMemFormat == MemoryFormat::ChannelsLast) { + if (executeGatherOp) { out = at::empty_like(result, MemoryFormat::Contiguous); } @@ -923,18 +873,10 @@ static void elu_variants_out_mps(const Tensor& self, newCachedGraph->outputTensor_ = outputTensor; }); - Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp); - Placeholder outputPlaceholder = - Placeholder(cachedGraph->outputTensor_, out.has_storage() ? out : result, nil, false); - - // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out.has_storage() ? out : result, nil, false); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); if (out.has_storage()) { result.copy_(out); } @@ -1040,15 +982,8 @@ static void elu_variants_out_mps(const Tensor& self, Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, out.has_storage() ? 
out : grad_input, nil, false); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - selfOrResultPlaceholder.getMPSGraphTensor() : selfOrResultPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, selfOrResultPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); if (out.has_storage()) { grad_input.copy_(out); } @@ -1095,13 +1030,8 @@ static void elu_variants_out_mps(const Tensor& self, Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1167,16 +1097,8 @@ static void elu_variants_out_mps(const Tensor& self, Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = @{ - gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData(), - }; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, gradOutputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); } return grad_input; } @@ -1256,9 +1178,7 @@ Tensor glu_backward_mps(const Tensor& grad_output, const Tensor& self, const int cachedGraph->betaTensor_ : getMPSGraphTensorFromScalar(stream, beta_scalar), cachedGraph->thresholdTensor_ : getMPSGraphTensorFromScalar(stream, threshold_scalar), }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1335,9 +1255,7 @@ Tensor glu_backward_mps(const Tensor& grad_output, const Tensor& self, const int cachedGraph->betaTensor_ : getMPSGraphTensorFromScalar(stream, beta_scalar), cachedGraph->thresholdTensor_ : getMPSGraphTensorFromScalar(stream, threshold_scalar), }; - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); } } @@ -1357,6 +1275,11 @@ Tensor glu_backward_mps(const Tensor& grad_output, const Tensor& self, const int MPSStream* stream = getCurrentMPSStream(); + bool executeGatherOp = + !(self.is_contiguous(MemoryFormat::Contiguous) || 
self.is_contiguous(MemoryFormat::ChannelsLast) || + self.is_contiguous(MemoryFormat::ChannelsLast3d)); + Tensor result_ = at::empty_like(self, executeGatherOp ? MemoryFormat::Contiguous : MemoryFormat::Preserve); + @autoreleasepool { string key = "mish_out_mps:" + getTensorsStringKey({self}); @@ -1373,16 +1296,15 @@ Tensor glu_backward_mps(const Tensor& grad_output, const Tensor& self, const int newCachedGraph->inputTensor_ = inputTensor; newCachedGraph->outputTensor_ = outputTensor; }); - Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); - Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, nil, executeGatherOp); + Placeholder outputPlaceholder = + Placeholder(cachedGraph->outputTensor_, executeGatherOp ? result_ : result, nil, false); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); + } + if (executeGatherOp) { + result.copy_(result_); } } @@ -1445,14 +1367,8 @@ Tensor mish_backward_mps(const Tensor& grad_output, const Tensor& self) { Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - }; - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); return grad_input; } } @@ -1518,9 +1434,7 @@ Tensor mish_backward_mps(const Tensor& grad_output, const Tensor& self) { selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), cachedGraph->lambdTensor_ : getMPSGraphTensorFromScalar(stream, lambd_scalar), }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1587,9 +1501,7 @@ static void shrink_backward_out_mps(const Tensor& grad_output, selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), cachedGraph->lambdTensor_ : getMPSGraphTensorFromScalar(stream, lambd_scalar), }; - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); return; } } @@ -1648,14 +1560,8 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) { Placeholder weightPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); - // 
Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - weightPlaceholder.getMPSGraphTensor() : weightPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, weightPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return result; } @@ -1720,16 +1626,8 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) { Placeholder gradInputPlaceholder = Placeholder(cachedGraph->outputTensor_, grad_input); Placeholder weightedGradPlaceholder = Placeholder(cachedGraph->weightedGradTensor_, weight_grad); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - weightPlaceholder.getMPSGraphTensor() : weightPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = @{ - gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData(), - weightedGradPlaceholder.getMPSGraphTensor() : weightedGradPlaceholder.getMPSGraphTensorData() - }; + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, selfPlaceholder, weightPlaceholder); + auto results = dictionaryFromPlaceholders(gradInputPlaceholder, weightedGradPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); } return std::tuple{grad_input, weight_grad}; @@ -1770,14 +1668,8 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) { Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); - // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1832,16 +1724,8 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) { Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, gradOutputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); } } @@ -1881,14 +1765,8 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) { Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); - // Create dictionary of 
inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1943,13 +1821,8 @@ Tensor prelu_mps(const Tensor& self, const Tensor& weight_) { Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, gradOutputPlaceholder); + auto results = dictionaryFromPlaceholders(gradInputPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); } @@ -2033,14 +1906,8 @@ Tensor hardtanh_backward_mps(const Tensor& grad_output, const Tensor& self, cons Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, selfPlaceholder); + auto results = dictionaryFromPlaceholders(gradInputPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); } @@ -2116,12 +1983,8 @@ Tensor hardtanh_backward_mps(const Tensor& grad_output, const Tensor& self, cons Placeholder(cachedGraph->outputTensor_, out.has_storage() ? 
out : output, nil, false); // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + auto results = dictionaryFromPlaceholders(outputPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); if (out.has_storage()) { output.copy_(out); @@ -2218,15 +2081,8 @@ Tensor hardswish_backward_mps(const Tensor& grad_output, const Tensor& self) { Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, selfPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, gradInputPlaceholder); } return grad_input; } diff --git a/aten/src/ATen/native/mps/operations/AdaptivePooling.mm b/aten/src/ATen/native/mps/operations/AdaptivePooling.mm index c88d468f7ed15..c38d5faec6a73 100644 --- a/aten/src/ATen/native/mps/operations/AdaptivePooling.mm +++ b/aten/src/ATen/native/mps/operations/AdaptivePooling.mm @@ -37,8 +37,9 @@ static void set_kernel_params(int64_t isizeH, if (isizeH >= osizeH) { if (check_avg_pooling) { - TORCH_CHECK((isizeH % osizeH == 0 && isizeW % osizeW == 0), - "Adaptive pool MPS: input sizes must be divisible by output sizes."); + TORCH_CHECK( + (isizeH % osizeH == 0 && isizeW % osizeW == 0), + "Adaptive pool MPS: input sizes must be divisible by output sizes. Non-divisible input sizes are not implemented on MPS device yet. For now, you can manually transfer tensor to cpu in this case. Please refer to [this issue](https://github.com/pytorch/pytorch/issues/96056)"); } strideH = (int64_t)(isizeH / osizeH); strideW = (int64_t)(isizeW / osizeW); @@ -46,8 +47,9 @@ static void set_kernel_params(int64_t isizeH, kernel_sizeW = isizeW - (osizeW - 1) * strideW; } else { if (check_avg_pooling) { - TORCH_CHECK((osizeH % isizeH == 0 && osizeW % isizeW == 0), - "Adaptive pool MPS: output sizes must be divisible by input sizes."); + TORCH_CHECK( + (osizeH % isizeH == 0 && osizeW % isizeW == 0), + "Adaptive pool MPS: output sizes must be divisible by input sizes. Non-divisible input sizes are not implemented on MPS device yet. For now, you can manually transfer tensor to cpu in this case. 
Please refer to [this issue](https://github.com/pytorch/pytorch/issues/96056)"); } strideH = (int64_t)(osizeH / isizeH); strideW = (int64_t)(osizeW / isizeW); diff --git a/aten/src/ATen/native/mps/operations/BinaryKernel.mm b/aten/src/ATen/native/mps/operations/BinaryKernel.mm index 3b428d09c2d3f..409512a737971 100644 --- a/aten/src/ATen/native/mps/operations/BinaryKernel.mm +++ b/aten/src/ATen/native/mps/operations/BinaryKernel.mm @@ -6,6 +6,8 @@ #include #include #include +// For MTLLanguageVersion_3_1 +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -22,7 +24,7 @@ namespace at::native { namespace mps { -static const char* METAL_BINARY = R"BINARY_METAL( +static MetalShaderLibrary lib(R"BINARY_METAL( #include using namespace metal; @@ -190,24 +192,25 @@ kernel void nextafter_kernel(constant void * input_ [[buffer(0)]], device void * out_ [[buffer(2)]], constant uint3 * offsets [[buffer(3)]], uint tid [[thread_position_in_grid]]) { - device T* out = (device T*)((device uint8_t*)out_ + offsets[tid].x); - constant T* input = (constant T*)((constant uint8_t*)input_ + offsets[tid].y); - constant T* other = (constant T*)((constant uint8_t*)other_ + offsets[tid].z); - - if (*input == *other) - { - *out = *other; - } - else if (isnan(*input) || isnan(*other)) - { + auto out = (device T*)((device uint8_t*)out_ + offsets[tid].x); + auto input = *(constant T*)((constant uint8_t*)input_ + offsets[tid].y); + auto other = *(constant T*)((constant uint8_t*)other_ + offsets[tid].z); +#if __METAL_VERSION__ >= 310 + *out = nextafter(input, other); +#else + if (input == other) { + *out = input; + } else if (isnan(input) || isnan(other)) { *out = NAN; - } - else - { - U bits = as_type(*input); - bits = bits + ((*other > *input) ? 1 : -1); + } else if (input == 0) { + constexpr auto one = as_type(static_cast(1)); + *out = other > 0 ? one : -one; + } else { + U bits = as_type(input); + (input > 0) ^ (input > other) ? 
bits++ : bits--; *out = as_type(bits); } +#endif } #define REGISTER_NEXTAFTER_OP(DTYPE, UTYPE) \ @@ -249,43 +252,7 @@ kernel void complex_kernel(constant void * real_ [[buffer(0)]], REGISTER_COMPLEX_OUT_OP(float); REGISTER_COMPLEX_OUT_OP(half); -)BINARY_METAL"; - -using namespace mps; - -static id compileBinaryOpsLibrary(id device) { - static id binaryLibrary = nil; - if (binaryLibrary) { - return binaryLibrary; - } - - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; - binaryLibrary = [device newLibraryWithSource:[NSString stringWithCString:METAL_BINARY encoding:NSASCIIStringEncoding] - options:options - error:&error]; - TORCH_CHECK(binaryLibrary, "Failed to create metal binary library, error: ", [[error description] UTF8String]); - return binaryLibrary; -} - -static id binaryPipelineState(id device, const std::string& kernel) { - static std::unordered_map> psoCache; - id pso = psoCache[kernel]; - if (pso) { - return pso; - } - - NSError* error = nil; - id binaryLib = compileBinaryOpsLibrary(device); - id binaryFunc = [binaryLib newFunctionWithName:[NSString stringWithUTF8String:kernel.c_str()]]; - TORCH_CHECK(binaryFunc, "Failed to create function state object for: ", kernel); - pso = [device newComputePipelineStateWithFunction:binaryFunc error:&error]; - TORCH_CHECK(pso, "Failed to created pipeline state object, error: ", [[error description] UTF8String]); - - psoCache[kernel] = pso; - return pso; -} +)BINARY_METAL"); static void binary_mps_impl(TensorIteratorBase& iter, const std::string func_name) { TORCH_CHECK(iter.common_dtype() != at::kDouble, "float64 is not supported on MPS"); @@ -302,10 +269,10 @@ static void binary_mps_impl(TensorIteratorBase& iter, const std::string func_nam dispatch_sync_with_rethrow(mpsStream->queue(), ^() { @autoreleasepool { id computeEncoder = mpsStream->commandEncoder(); - const std::string kernel = func_name + "_" + scalarToMetalTypeString(input.scalar_type()); + const std::string kernel = func_name + "_" + scalarToMetalTypeString(input); auto kernelDataOffsets = generateKernelDataOffsets(computeEncoder, iter); - id binaryPSO = binaryPipelineState(device, kernel); + id binaryPSO = lib.getPipelineStateForFunc(kernel); // this function call is a no-op if MPS Profiler is not enabled getMPSProfiler().beginProfileKernel(binaryPSO, kernel, {input, other}); @@ -323,7 +290,7 @@ static void binary_mps_impl(TensorIteratorBase& iter, const std::string func_nam } void complex_mul_out(const Tensor& input, const Tensor& other, const Tensor& output) { - TORCH_INTERNAL_ASSERT(c10::isComplexType(input.scalar_type()) && c10::isComplexType(other.scalar_type())); + TORCH_INTERNAL_ASSERT(c10::isComplexType(input.scalar_type()) || c10::isComplexType(other.scalar_type())); auto new_size = at::infer_size(input.sizes(), other.sizes()); if (!output.sizes().equals(new_size)) { output.resize_(new_size); @@ -332,9 +299,10 @@ void complex_mul_out(const Tensor& input, const Tensor& other, const Tensor& out if (length == 0) { return; } + auto common_dtype = output.scalar_type(); auto output_as_real = at::view_as_real(output).select(output.dim(), 0); - auto input_as_real = at::view_as_real(input).select(input.dim(), 0); - auto other_as_real = at::view_as_real(other).select(other.dim(), 0); + auto input_as_real = at::view_as_real(input.to(kMPS, common_dtype)).select(input.dim(), 0); + auto other_as_real = at::view_as_real(other.to(kMPS, common_dtype)).select(other.dim(), 0); auto iter = 
TensorIteratorConfig().add_output(output_as_real).add_input(input_as_real).add_input(other_as_real).build(); diff --git a/aten/src/ATen/native/mps/operations/BinaryOps.mm b/aten/src/ATen/native/mps/operations/BinaryOps.mm index 4c96954ef4aeb..a225ab83028d2 100644 --- a/aten/src/ATen/native/mps/operations/BinaryOps.mm +++ b/aten/src/ATen/native/mps/operations/BinaryOps.mm @@ -53,6 +53,14 @@ #define BinaryOpFn(graph, primary, secondary) \ MPSGraphTensor*(mps::BinaryOpCachedGraph * graph, MPSGraphTensor * primary, MPSGraphTensor * secondary) +static inline Tensor legacy_complex_as_view(const Tensor& t) { + // Convert non-complex types (and cdouble CPU scalars) to cfloat + if (!isComplexType(t.scalar_type()) || t.scalar_type() == kComplexDouble) { + return at::view_as_real(t.to(kMPS, kComplexFloat)); + } + return at::view_as_real(t.dim() != 0 ? t : t.to(kMPS)); +} + // alpha is always 1.0 except when this function is called from add_sub_lerp_template() static void binaryOpTensor(const Tensor& self, const Tensor& other, @@ -69,7 +77,8 @@ static void binaryOpTensor(const Tensor& self, "MPS: ", op_name, " op with int64 input is supported natively starting from macOS 13.2"); - TORCH_CHECK_TYPE(!isComplexType(self.scalar_type()), "Complex types are unsupported on MPS"); + TORCH_CHECK_TYPE(!isComplexType(self.scalar_type()) || mps::supportsComplex(), + "Complex types are supported starting from MacOS 14.0+"); MPSStream* mpsStream = getCurrentMPSStream(); const bool is_self_scalar = self.dim() == 0; @@ -88,7 +97,7 @@ static void binaryOpTensor(const Tensor& self, Tensor output = output_; bool needsCopyToOutput = false; - if (!output_.is_contiguous() || (output_.is_view() && (self.is_alias_of(output_) || other.is_alias_of(output_)))) { + if (needsGather(output_) || (output_.is_view() && (self.is_alias_of(output_) || other.is_alias_of(output_)))) { output = at::empty(output_.sizes(), output_.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); needsCopyToOutput = true; } @@ -184,9 +193,7 @@ static void binaryOpTensor(const Tensor& self, } Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, needsCopyToOutput ? 
output : output_); - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(mpsStream, cachedGraph->graph(), feeds, results); + runMPSGraph(mpsStream, cachedGraph->graph(), feeds, outputPlaceholder); if (needsCopyToOutput) { output_.copy_(output); @@ -390,7 +397,7 @@ static void add_sub_lerp_template(const Tensor& self, CREATE_MPS_BINARY_COMPARISON_OP_FUNC(logical_xor_out_mps, logicalXOR, Tensor); TORCH_IMPL_FUNC(mul_out_mps)(const Tensor& self, const Tensor& other, const Tensor& output) { - if (c10::isComplexType(self.scalar_type()) || c10::isComplexType(other.scalar_type())) { + if (!mps::supportsComplex() && (c10::isComplexType(self.scalar_type()) || c10::isComplexType(other.scalar_type()))) { return mps::complex_mul_out(self, other, output); } mps::binaryOpTensor( @@ -420,19 +427,27 @@ static void add_sub_lerp_template(const Tensor& self, } TORCH_IMPL_FUNC(add_out_mps)(const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& output) { - if (isComplexType(self.scalar_type()) && isComplexType(other.scalar_type()) && !alpha.isComplex()) { + if ((isComplexType(self.scalar_type()) || isComplexType(other.scalar_type())) && !alpha.isComplex() && + !mps::supportsComplex()) { // Complex add with non-complex alpha is just add over views - return mps::add_sub_lerp_template( - at::view_as_real(self), at::view_as_real(other), alpha, at::view_as_real(output), "add"); + return mps::add_sub_lerp_template(mps::legacy_complex_as_view(self), + mps::legacy_complex_as_view(other), + alpha, + mps::legacy_complex_as_view(output), + "add"); } mps::add_sub_lerp_template(self, other, alpha, output, "add"); } TORCH_IMPL_FUNC(sub_out_mps)(const Tensor& self, const Tensor& other, const Scalar& alpha, const Tensor& output) { - if (isComplexType(self.scalar_type()) && isComplexType(other.scalar_type()) && !alpha.isComplex()) { + if ((isComplexType(self.scalar_type()) || isComplexType(other.scalar_type())) && !alpha.isComplex() && + !mps::supportsComplex()) { // Complex sub with non-complex alpha is just add over views - return mps::add_sub_lerp_template( - at::view_as_real(self), at::view_as_real(other), alpha, at::view_as_real(output), "sub"); + return mps::add_sub_lerp_template(mps::legacy_complex_as_view(self), + mps::legacy_complex_as_view(other), + alpha, + mps::legacy_complex_as_view(output), + "sub"); } mps::add_sub_lerp_template(self, other, alpha, output, "sub"); } diff --git a/aten/src/ATen/native/mps/operations/BitwiseOps.mm b/aten/src/ATen/native/mps/operations/BitwiseOps.mm index 58a9e711b6322..f243b06ba5e9f 100644 --- a/aten/src/ATen/native/mps/operations/BitwiseOps.mm +++ b/aten/src/ATen/native/mps/operations/BitwiseOps.mm @@ -12,7 +12,7 @@ namespace at::native { namespace mps { -static const char* BITWISE_OPS_TEMPLATE = R"METAL( +static MetalShaderLibrary lib(R"METAL( kernel void bitwise_and_tensor(constant uint& length [[buffer(0)]], device {0} *out [[buffer(1)]], @@ -90,7 +90,8 @@ kernel void bitwise_not(constant uint& length [[buffer(0)]], }} out[offset] = ~a[offset]; }} -)METAL"; +)METAL", + 3); static const std::string& getMetalType(const c10::ScalarType& t) { // Mapping from c10::ScalarType to integral type that can be used for bitwise ops @@ -117,48 +118,12 @@ kernel void bitwise_not(constant uint& length [[buffer(0)]], return getMetalType(s.type()); } -static id compileBitwiseOpsLibrary(id device, - const std::string& t1, - const std::string& t2, - const std::string& t3) { - auto key = t1 + t2 + t3; - 
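An aside on the nextafter fallback added to BinaryKernel.mm above (the branch taken when __METAL_VERSION__ is below 310 and the builtin nextafter is unavailable): the kernel steps the float's bit pattern by one ULP toward the other operand. The same idea as a plain C++ sketch for float, for illustration only; this is not code from the patch.

#include <cmath>
#include <cstdint>
#include <cstring>

// Step `input` one representable float toward `other` by nudging its bit pattern.
static float nextafter_bits(float input, float other) {
  if (std::isnan(input) || std::isnan(other)) return NAN;
  if (input == other) return input;
  uint32_t bits;
  if (input == 0.0f) {
    bits = 1u;                      // smallest subnormal magnitude
    float tiny;
    std::memcpy(&tiny, &bits, sizeof(tiny));
    return other > 0.0f ? tiny : -tiny;
  }
  std::memcpy(&bits, &input, sizeof(bits));
  if ((input > 0.0f) == (input > other)) {
    --bits;   // moving toward zero: shrink the magnitude
  } else {
    ++bits;   // moving away from zero: grow the magnitude
  }
  std::memcpy(&input, &bits, sizeof(input));
  return input;
}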
static std::unordered_map> libMap; - auto it = libMap.find(key); - if (it != libMap.end()) { - return it->second; - } - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; - auto rc = - [device newLibraryWithSource:[NSString stringWithUTF8String:fmt::format(BITWISE_OPS_TEMPLATE, t1, t2, t3).c_str()] - options:options - error:&error]; - TORCH_CHECK(rc != nil && error == nil, "Failed to compile library: ", [[error localizedDescription] UTF8String]); - libMap[key] = rc; - return rc; -} - -static id getCPLState(id device, - const std::string& t1, - const std::string& t2, - const std::string& t3, +template +static id getCPLState(const Tensor& t1, + const Tensor& t2, + const ScalarOrTensor& t3, const std::string& fname) { - auto key = t1 + t2 + t3 + fname; - static std::unordered_map> cplMap; - auto it = cplMap.find(key); - if (it != cplMap.end()) { - return it->second; - } - NSError* error = nil; - auto library = compileBitwiseOpsLibrary(device, t1, t2, t3); - id func = [library newFunctionWithName:[NSString stringWithUTF8String:fname.c_str()]]; - TORCH_CHECK(func != nil, "Can't get function ", fname); - auto rc = [device newComputePipelineStateWithFunction:func error:&error]; - TORCH_CHECK( - rc != nil && error == nil, "Failed to construct pipeline state: ", [[error localizedDescription] UTF8String]); - cplMap[key] = rc; - return rc; + return lib.getPipelineStateForFunc(fname, {getMetalType(t1), getMetalType(t2), getMetalType(t3)}); } static void handle_tensor_tensor_binary_op(const Tensor& self, @@ -167,8 +132,7 @@ static void handle_tensor_tensor_binary_op(const Tensor& self, const std::string& kernel_name) { using namespace at::mps; MPSStream* stream = getCurrentMPSStream(); - id cplState = getCPLState( - MPSDevice::getInstance()->device(), getMetalType(output), getMetalType(self), getMetalType(other), kernel_name); + auto cplState = getCPLState(output, self, other, kernel_name); uint32_t length = output.numel(); if (length == 0) { return; @@ -198,8 +162,7 @@ static void handle_tensor_scalar_binary_op(const Tensor& self, const std::string& kernel_name) { using namespace at::mps; MPSStream* stream = getCurrentMPSStream(); - id cplState = getCPLState( - MPSDevice::getInstance()->device(), getMetalType(output), getMetalType(self), getMetalType(other), kernel_name); + auto cplState = getCPLState(output, self, other, kernel_name); uint64_t sval = other.to(); uint32_t length = output.numel(); if (length == 0) { @@ -236,7 +199,7 @@ static void _bitwise_op_out_mps(const Tensor& self, auto output_size = at::infer_size_dimvector(self.sizes(), other.sizes()); resize_output(output, output_size); - if (!output.is_contiguous()) { + if (needsGather(output)) { output = output.contiguous(); needs_output_copy = true; } @@ -277,7 +240,7 @@ static void _bitwise_not_out_mps(const Tensor& self, const Tensor& output_) { bool needs_output_copy = false; resize_output(output, self.sizes()); - if (!output.is_contiguous()) { + if (needsGather(output)) { output = output.contiguous(); needs_output_copy = true; } @@ -296,8 +259,7 @@ static void _bitwise_not_out_mps(const Tensor& self, const Tensor& output_) { } using namespace at::mps; MPSStream* stream = getCurrentMPSStream(); - id cplState = getCPLState( - MPSDevice::getInstance()->device(), getMetalType(output), getMetalType(self), getMetalType(self), "bitwise_not"); + auto cplState = getCPLState(output, self, self, "bitwise_not"); dispatch_sync(stream->queue(), ^() { 
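The compileBitwiseOpsLibrary/getCPLState boilerplate removed around here (and its twins in BinaryKernel.mm, Bucketization.mm, CrossKernel.mm, Gamma.mm, HistogramKernel.mm below) all hand-rolled the same pattern that the shared MetalShaderLibrary helper now appears to centralize: compile the embedded Metal source once, then memoize one compute pipeline state per kernel name. A minimal sketch of that caching pattern, with hypothetical stand-ins for the Metal objects:

#include <string>
#include <unordered_map>

// Hypothetical stand-in for id<MTLComputePipelineState>.
struct PipelineState {
  std::string function_name;
};

class TinyShaderCache {
 public:
  explicit TinyShaderCache(std::string source) : source_(std::move(source)) {}

  // Compile-on-first-use, then reuse: the shape of getPipelineStateForFunc().
  const PipelineState& getPipelineStateForFunc(const std::string& fname) {
    auto it = cache_.find(fname);
    if (it == cache_.end()) {
      // In the real code this is where newLibraryWithSource: and
      // newComputePipelineStateWithFunction: would run.
      it = cache_.emplace(fname, PipelineState{fname}).first;
    }
    return it->second;
  }

 private:
  std::string source_;                                    // the R"(...)" Metal source
  std::unordered_map<std::string, PipelineState> cache_;  // one PSO per kernel name
};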
getMPSProfiler().beginProfileKernel(cplState, "bitwise_not", {self}); diff --git a/aten/src/ATen/native/mps/operations/Blas.mm b/aten/src/ATen/native/mps/operations/Blas.mm index 74cc252ddb3e9..1714a8e7e2f88 100644 --- a/aten/src/ATen/native/mps/operations/Blas.mm +++ b/aten/src/ATen/native/mps/operations/Blas.mm @@ -102,15 +102,8 @@ Tensor dot_mps(const Tensor& self, const Tensor& other) { Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, otherPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return output; @@ -188,10 +181,7 @@ Tensor dot_mps(const Tensor& self, const Tensor& other) { feeds[selfPlaceholder.getMPSGraphTensor()] = selfPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return result; diff --git a/aten/src/ATen/native/mps/operations/Bucketization.mm b/aten/src/ATen/native/mps/operations/Bucketization.mm index 52696dc019179..6f725e851af67 100644 --- a/aten/src/ATen/native/mps/operations/Bucketization.mm +++ b/aten/src/ATen/native/mps/operations/Bucketization.mm @@ -17,7 +17,7 @@ namespace at::native { namespace mps { -static const char* METAL_BUCKETIZATION = R"BUCKETIZE_METAL( +static MetalShaderLibrary lib(R"BUCKETIZE_METAL( #include using namespace metal; @@ -194,44 +194,7 @@ kernel void searchsorted( REGISTER_SEARCHSORTED_OP(long, int); REGISTER_SEARCHSORTED_OP(long, long); -)BUCKETIZE_METAL"; - -static id compileBucketizationOpsLibrary(id device) { - static id bucketizationLibrary = nil; - if (bucketizationLibrary) { - return bucketizationLibrary; - } - - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; - bucketizationLibrary = [device newLibraryWithSource:[NSString stringWithCString:METAL_BUCKETIZATION - encoding:NSASCIIStringEncoding] - options:options - error:&error]; - TORCH_CHECK( - bucketizationLibrary, "Failed to create metal bucketization library, error: ", [[error description] UTF8String]); - return bucketizationLibrary; -} - -static id bucketizationPipelineState(id device, const std::string& kernel) { - static std::unordered_map> psoCache; - id pso = psoCache[kernel]; - if (pso) { - return pso; - } - - NSError* error = nil; - id bucketizationLib = compileBucketizationOpsLibrary(device); - id bucketizationFunc = - [bucketizationLib newFunctionWithName:[NSString stringWithUTF8String:kernel.c_str()]]; - TORCH_CHECK(bucketizationFunc, "Failed to create function state object for: ", kernel); - pso = [device newComputePipelineStateWithFunction:bucketizationFunc error:&error]; - TORCH_CHECK(pso, "Failed to created pipeline state object, error: ", [[error description] UTF8String]); - - psoCache[kernel] = pso; - return pso; -} +)BUCKETIZE_METAL"); static void 
searchsorted_mps_contiguous(Tensor& result, const Tensor& input, @@ -250,15 +213,14 @@ static void searchsorted_mps_contiguous(Tensor& result, int64_t right_i64 = right; int64_t is_1d_boundaries = boundaries.dim() == 1; - id device = MPSDevice::getInstance()->device(); MPSStream* mpsStream = getCurrentMPSStream(); - dispatch_sync(mpsStream->queue(), ^() { + dispatch_sync_with_rethrow(mpsStream->queue(), ^() { @autoreleasepool { id computeEncoder = mpsStream->commandEncoder(); - const std::string kernel = "searchsorted_" + scalarToMetalTypeString(input.scalar_type()) + "_" + - scalarToMetalTypeString(result.scalar_type()) + (sorter.defined() ? "_sorter" : ""); - id bucketizationPSO = mps::bucketizationPipelineState(device, kernel); + const std::string kernel = "searchsorted_" + scalarToMetalTypeString(input) + "_" + + scalarToMetalTypeString(result) + (sorter.defined() ? "_sorter" : ""); + id bucketizationPSO = lib.getPipelineStateForFunc(kernel); // this function call is a no-op if MPS Profiler is not enabled getMPSProfiler().beginProfileKernel(bucketizationPSO, kernel, {input, boundaries, sorter}); @@ -308,7 +270,7 @@ static void searchsorted_mps_contiguous(Tensor& result, return result; } - // for non-contiguous result tensors, we write the output to a contiguous copy so we can later copy back, maintaing + // for non-contiguous result tensors, we write the output to a contiguous copy so we can later copy back, maintaining // the original result tensor Tensor out = result.contiguous(); diff --git a/aten/src/ATen/native/mps/operations/ConstantOps.mm b/aten/src/ATen/native/mps/operations/ConstantOps.mm index 52c74c3637e1a..2e7d0881bb60f 100644 --- a/aten/src/ATen/native/mps/operations/ConstantOps.mm +++ b/aten/src/ATen/native/mps/operations/ConstantOps.mm @@ -21,13 +21,14 @@ } Tensor output = self; bool needsCopyToOutput = false; - if (!self.is_contiguous() || self.storage_offset()) { + if (needsGather(self)) { output = at::empty(self.sizes(), self.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); needsCopyToOutput = true; } struct CachedGraph : public MPSCachedGraph { CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* inputTensor_ = nil; MPSGraphTensor* outputTensor_ = nil; }; @@ -35,36 +36,23 @@ string key = "fill_scalar_mps_impl" + getTensorsStringKey(self) + ":" + to_string(value.toDouble()); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - auto isBool = self.scalar_type() == c10::ScalarType::Bool; - auto isUInt8 = self.scalar_type() == c10::ScalarType::Byte; - auto dataType = !isUInt8 ? !isBool ? 
getMPSScalarType(self.scalar_type()) : MPSDataTypeInt8 : MPSDataTypeUInt32; - // constantWithScalar does not work for boolTypes on MacOS-12.[34] - // workaround by filing it as int8 tensor and than casting to bool - // See https://github.com/pytorch/pytorch/issues/82427 - // constantWithScalar does not work for UInt8 Types on MacOS-12.[34]/Ventura preview - // workaround by filing it as uint32 tensor and than casting to uint8 - // See https://github.com/pytorch/pytorch/issues/83692 - MPSGraphTensor* inputTensor = [mpsGraph constantWithScalar:value.toDouble() - shape:getMPSShape(self) - dataType:dataType]; + MPSGraphTensor* inputTensor = mpsGraphScalarPlaceHolder(mpsGraph, getMPSDataType(self.scalar_type())); MPSGraphTensor* outputTensor = [mpsGraph identityWithTensor:inputTensor name:nil]; - if (isBool) { - outputTensor = [mpsGraph castTensor:outputTensor toType:MPSDataTypeBool name:@"constWithBool-workaround"]; - } - if (isUInt8) { - outputTensor = [mpsGraph castTensor:outputTensor toType:MPSDataTypeUInt8 name:@"constWithUInt8-workaround"]; - } - + newCachedGraph->inputTensor_ = inputTensor; newCachedGraph->outputTensor_ = outputTensor; }); + auto mpsScalar = getMPSScalar(value, self.scalar_type()); + auto mpsScalarData = getMPSGraphTensorFromScalar(getCurrentMPSStream(), mpsScalar); + NSDictionary* feeds = @{cachedGraph->inputTensor_ : mpsScalarData}; + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, needsCopyToOutput ? output : self, nullptr, !needsCopyToOutput); NSDictionary* results = @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), /*feeds*/ nil, results); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); if (needsCopyToOutput) { self.copy_(output); diff --git a/aten/src/ATen/native/mps/operations/Convolution.mm b/aten/src/ATen/native/mps/operations/Convolution.mm index 4f262d1549fcb..fbf5a67262be2 100644 --- a/aten/src/ATen/native/mps/operations/Convolution.mm +++ b/aten/src/ATen/native/mps/operations/Convolution.mm @@ -318,10 +318,7 @@ static Tensor _mps_convolution_impl(const Tensor& input_t, feeds[biasPlaceholder.getMPSGraphTensor()] = biasPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return *output; @@ -486,15 +483,8 @@ static Tensor mps_convolution_backward_input(IntArrayRef input_size, auto weightsPlaceholder = Placeholder(cachedGraph->weightTensor_, weight_t); auto outputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, *grad_input); - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - weightsPlaceholder.getMPSGraphTensor() : weightsPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, weightsPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return *grad_input; } @@ -650,15 +640,8 @@ static Tensor mps_convolution_backward_weights(IntArrayRef weight_size, auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); auto outputPlaceholder = 
Placeholder(cachedGraph->gradWeightTensor_, grad_weight_t); - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, inputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return grad_weight_t; diff --git a/aten/src/ATen/native/mps/operations/Copy.mm b/aten/src/ATen/native/mps/operations/Copy.mm index 8b1dd402e4f34..572582f5cb947 100644 --- a/aten/src/ATen/native/mps/operations/Copy.mm +++ b/aten/src/ATen/native/mps/operations/Copy.mm @@ -2,9 +2,15 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include #include +#include #include #include #include +#include +#include +#include +#include +#include namespace at::native { namespace mps { @@ -41,14 +47,22 @@ static void copy_cast_mps(at::Tensor& dst, MPSShape* srcShape = getMPSShape(src); @autoreleasepool { - string key = "copy_cast_mps" + getTensorsStringKey({src, dst}); + const bool needs_conj = src.is_conj() != dst.is_conj(); + string key = "copy_cast_mps" + getTensorsStringKey({src, dst}, true, /*exclude_shape*/ true) + ":" + + std::to_string(needs_conj); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, src); - MPSGraphTensor* inputCastTensor = inputTensor; + MPSGraphTensor* inputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, srcDType); + auto outputTensor = inputTensor; if (isFloatingType(src.scalar_type()) && dstDType == MPSDataTypeUInt8) { - inputCastTensor = [mpsGraph castTensor:inputTensor toType:MPSDataTypeInt32 name:@"cast"]; + outputTensor = [mpsGraph castTensor:inputTensor toType:MPSDataTypeInt32 name:@"cast"]; + } + if (srcDType != dstDType) { + outputTensor = [mpsGraph castTensor:outputTensor toType:dstDType name:@"cast"]; + } + if (needs_conj) { + TORCH_CHECK(supportsComplex(), "MPS complex tensors conjugation needs MacOS14+"); + outputTensor = [mpsGraph conjugateWithTensor:outputTensor name:nil]; } - MPSGraphTensor* outputTensor = [mpsGraph castTensor:inputCastTensor toType:dstDType name:@"cast"]; newCachedGraph->inputTensor_ = inputTensor; newCachedGraph->outputTensor_ = outputTensor; @@ -72,12 +86,11 @@ static void copy_cast_mps(at::Tensor& dst, id device = MPSDevice::getInstance()->device(); MPSStream* stream = getCurrentMPSStream(); - Tensor dst; - Tensor src; + Tensor dst = dst_; + Tensor src = src_; + if (!dst_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) { dst = at::empty_like(dst_, LEGACY_CONTIGUOUS_MEMORY_FORMAT); - } else { - dst = dst_; } auto storage_byte_offset = src_.storage_offset() * src_.itemsize(); @@ -90,9 +103,8 @@ static void copy_cast_mps(at::Tensor& dst, src = src_.expand_as(dst).contiguous(); storage_byte_offset = src.storage_offset() * src.itemsize(); } - } else { - src = src_; } + id sourceBuffer = getMTLBufferStorage(src); size_t dst_tensor_nbytes = dst.nbytes(); @@ -110,28 +122,25 @@ static void copy_cast_mps(at::Tensor& dst, length:alignedLength options:options deallocator:nil]; - id tmpBuffer = sourceBuffer; - Tensor tmp; + id maybeCastedSourceBuffer = sourceBuffer; + Tensor maybeCastedSource; bool needsBlit = true; if (src_.dtype() != dst.dtype()) { 
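Stepping back to the recurring replacement of bare !is_contiguous() checks with needsGather() in the hunks above (BinaryOps.mm, BitwiseOps.mm, ConstantOps.mm, and Gamma.mm below): the ConstantOps.mm change, where "!self.is_contiguous() || self.storage_offset()" becomes "needsGather(self)", suggests the helper also folds in the storage-offset case. A guessed restatement of the predicate, not the helper's actual definition:

#include <ATen/core/Tensor.h>

// Assumption, inferred from the ConstantOps.mm hunk: a tensor needs a gather
// into a contiguous temporary when it is non-contiguous or does not start at
// the beginning of its storage.
static bool needs_gather_like(const at::Tensor& t) {
  return !t.is_contiguous() || t.storage_offset() != 0;
}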
if (destOffset == 0 && storage_byte_offset == 0) { // Return the casted tensor directly if there's no destination offset needsBlit = false; - tmpBuffer = destBuffer; + maybeCastedSourceBuffer = destBuffer; } else if (src.element_size() < dst.element_size()) { - tmp = at::empty(dst.sizes(), dst.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); - tmpBuffer = getMTLBufferStorage(tmp); + maybeCastedSource = at::empty(dst.sizes(), dst.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); + maybeCastedSourceBuffer = getMTLBufferStorage(maybeCastedSource); } - } - size_t size_to_copy = src.nbytes(); - // In case of dtype change, first convert src inplace - if (src_.dtype() != dst.dtype()) { - copy_cast_mps(dst, src, tmpBuffer, sourceBuffer, non_blocking); + // In case of dtype change, first convert src inplace + copy_cast_mps(dst, src, maybeCastedSourceBuffer, sourceBuffer, non_blocking); } if (needsBlit) { - size_to_copy = (size_to_copy / src.element_size()) * dst.element_size(); + const size_t size_to_copy = (src.nbytes() / src.element_size()) * dst.element_size(); // If there's anything wrong with source, we shouldn't return dst_ silently and must error out. TORCH_INTERNAL_ASSERT(sourceBuffer && dst_tensor_nbytes > 0); @@ -139,7 +148,7 @@ static void copy_cast_mps(at::Tensor& dst, getMPSProfiler().beginProfileCopy(sourceBuffer, destBuffer, src, dst, size_to_copy, non_blocking); stream->copy_and_sync( - tmpBuffer, destBuffer, size_to_copy, storage_byte_offset, destOffset, non_blocking, profile_id); + maybeCastedSourceBuffer, destBuffer, size_to_copy, storage_byte_offset, destOffset, non_blocking, profile_id); } [destBuffer release]; } @@ -227,7 +236,7 @@ void copy_blit_mps(void* dst, const void* src, size_t size) { Tensor src; auto sameMemFormat = src_.is_contiguous(dst_.suggest_memory_format()) && dst_.is_contiguous(dst_.suggest_memory_format()); - const bool sameDataType = src_.dtype() == dst_.dtype(); + const bool sameDataType = src_.dtype() == dst_.dtype() && src_.is_conj() == dst_.is_conj(); if ((!src_.is_contiguous(MemoryFormat::Contiguous) && !sameMemFormat) || // the copy_cast path requires storage_offset to be applied before casting @@ -266,13 +275,32 @@ void copy_blit_mps(void* dst, const void* src, size_t size) { // for GPU to GPU copies we only encode to stream's command buffer (no flushing) stream->copy(sourceBuffer, destBuffer, src.nbytes(), src_byte_offset, dst_byte_offset, profile_id); } else { - if (dst_byte_offset) { - auto tmp = at::empty(dst_.sizes(), dst_.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); - auto tmpBuffer = getMTLBufferStorage(tmp); - copy_cast_mps(tmp, src, tmpBuffer, sourceBuffer); - - uint64_t profile_id = getMPSProfiler().beginProfileCopy(tmpBuffer, destBuffer, tmp, dst_, dst_.nbytes(), true); - stream->copy(tmpBuffer, destBuffer, dst_.nbytes(), 0, dst_byte_offset, profile_id); + // Simulate cast to Complex on older MacOS by initializing real and imag parts + if (dst_.is_complex() && !supportsComplex()) { + if (!src.is_complex()) { + at::real(dst_).copy_(src); + at::imag(dst_).fill_(0); + } else if (src.is_conj() || dst_.is_conj()) { + // One cannot take view of conjugated tensor, but for some reason real and imag views are fine + // Use this to implement a conjugation + at::real(dst_).copy_(at::real(src)); + if (src.is_conj() != dst_.is_conj()) { + at::imag(dst_).copy_(at::neg(at::imag(src))); + } else { + at::imag(dst_).copy_(at::imag(src)); + } + } else { + at::view_as_real(dst_).copy_(at::view_as_real(src)); + } 
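The branch added just above emulates complex copies (and conjugation) on macOS versions without native MPS complex support by copying the real and imaginary planes separately, negating the imaginary plane when exactly one side is conjugated. The same decomposition in plain C++, as an illustration only:

#include <complex>
#include <cstddef>
#include <vector>

// dst[i] = src[i], optionally conjugated: real parts copy through unchanged,
// the imaginary part is negated when conj flags mismatch between src and dst.
static void copy_complex_via_parts(const std::vector<std::complex<float>>& src,
                                   std::vector<std::complex<float>>& dst,
                                   bool conj_mismatch) {
  dst.resize(src.size());
  for (std::size_t i = 0; i < src.size(); ++i) {
    const float re = src[i].real();
    const float im = src[i].imag();
    dst[i] = {re, conj_mismatch ? -im : im};
  }
}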
+ } else if (dst_byte_offset) { + auto maybeCastedSource = + at::empty(dst_.sizes(), dst_.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); + auto maybeCastedSourceBuffer = getMTLBufferStorage(maybeCastedSource); + copy_cast_mps(maybeCastedSource, src, maybeCastedSourceBuffer, sourceBuffer); + + uint64_t profile_id = getMPSProfiler().beginProfileCopy( + maybeCastedSourceBuffer, destBuffer, maybeCastedSource, dst_, dst_.nbytes(), true); + stream->copy(maybeCastedSourceBuffer, destBuffer, dst_.nbytes(), 0, dst_byte_offset, profile_id); } else { copy_cast_mps(dst_, src, destBuffer, sourceBuffer); } diff --git a/aten/src/ATen/native/mps/operations/CrossKernel.mm b/aten/src/ATen/native/mps/operations/CrossKernel.mm index 1e04a7633f1aa..69de4c5e78cc1 100644 --- a/aten/src/ATen/native/mps/operations/CrossKernel.mm +++ b/aten/src/ATen/native/mps/operations/CrossKernel.mm @@ -8,7 +8,10 @@ namespace at::native { namespace { -static const char* METAL_CROSS = R"CROSS_METAL( +using namespace mps; + +static MetalShaderLibrary lib(R"CROSS_METAL( +#include #include using namespace metal; @@ -75,44 +78,7 @@ kernel void cross(constant void * input_ [[buffer(0)]], REGISTER_CROSS_OP(uchar); REGISTER_CROSS_OP(bool); -)CROSS_METAL"; - -using namespace mps; - -static id compileCrossOpLibrary(id device) { - static id crossLibrary = nil; - if (crossLibrary) { - return crossLibrary; - } - - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; - crossLibrary = [device newLibraryWithSource:[NSString stringWithCString:METAL_CROSS encoding:NSASCIIStringEncoding] - options:options - error:&error]; - TORCH_CHECK(crossLibrary, "Failed to create metal cross library, error: ", [[error description] UTF8String]); - return crossLibrary; -} - -static id crossPipelineState(id device, ScalarType scalar_type) { - std::string kernel = "cross_" + scalarToMetalTypeString(scalar_type); - static std::unordered_map> psoCache; - id pso = psoCache[kernel]; - if (pso) { - return pso; - } - - NSError* error = nil; - id crossLib = compileCrossOpLibrary(device); - id crossFunc = [crossLib newFunctionWithName:[NSString stringWithUTF8String:kernel.c_str()]]; - TORCH_CHECK(crossFunc, "Failed to create function state object for: ", kernel); - pso = [device newComputePipelineStateWithFunction:crossFunc error:&error]; - TORCH_CHECK(pso, "Failed to created pipeline state object, error: ", [[error description] UTF8String]); - - psoCache[kernel] = pso; - return pso; -} +)CROSS_METAL"); void cross_mps_impl(const Tensor& out, const Tensor& input, const Tensor& other, int64_t dim) { TORCH_CHECK(input.dtype() != at::kDouble, "float64 is not supported on MPS"); @@ -138,7 +104,7 @@ void cross_mps_impl(const Tensor& out, const Tensor& input, const Tensor& other, id computeEncoder = mpsStream->commandEncoder(); auto kernelDataOffsets = generateKernelDataOffsets(computeEncoder, iter); - id crossPSO = crossPipelineState(device, out.scalar_type()); + auto crossPSO = lib.getPipelineStateForFunc("cross_" + scalarToMetalTypeString(out)); // this function call is a no-op if MPS Profiler is not enabled getMPSProfiler().beginProfileKernel(crossPSO, "cross", {input, other}); diff --git a/aten/src/ATen/native/mps/operations/Distributions.mm b/aten/src/ATen/native/mps/operations/Distributions.mm index f8bb70086d5ff..7ed06c8bf4373 100644 --- a/aten/src/ATen/native/mps/operations/Distributions.mm +++ b/aten/src/ATen/native/mps/operations/Distributions.mm @@ -133,11 
+133,7 @@ } Placeholder outputPlaceholder = Placeholder(cachedGraph->resultTensor, self); - NSDictionary* results = @{ - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), - }; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return self; @@ -235,7 +231,7 @@ scalar_type = ScalarType::Float; else if (scalar_type == ScalarType::ComplexHalf) scalar_type = ScalarType::Half; - AT_DISPATCH_FLOATING_TYPES_AND_HALF(scalar_type, "check_uniform_bounds", [&] { + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, scalar_type, "check_uniform_bounds", [&] { const auto min = static_cast(std::numeric_limits::lowest()); const auto max = static_cast(std::numeric_limits::max()); TORCH_CHECK(from <= to, "uniform_ expects to return a [from, to) range, but found from=", from, " > to=", to); @@ -325,17 +321,15 @@ Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optional= to=", to); if (isFloatingType(input_dtype)) { - AT_DISPATCH_FLOATING_TYPES_AND2( - at::ScalarType::Half, at::ScalarType::BFloat16, input_dtype, "random_update_from_to", [&] { - from = templates::update_from(from); - to = templates::update_to(to); - TORCH_CHECK( - from < to, - "random_mps_ expects 'from' casted to dtype to be less than 'to' casted to dtype, but got from=", - from, - " >= to=", - to); - }); + AT_DISPATCH_FLOATING_TYPES_AND2(kHalf, kBFloat16, input_dtype, "random_update_from_to", [&] { + from = templates::update_from(from); + to = templates::update_to(to); + TORCH_CHECK(from < to, + "random_mps_ expects 'from' casted to dtype to be less than 'to' casted to dtype, but got from=", + from, + " >= to=", + to); + }); templates::check_from_to_in_range(from, to - 1, self.dtype()); } } else if (from != std::numeric_limits::lowest()) { @@ -575,10 +569,7 @@ Tensor normal_mps(const Tensor& mean, const Tensor& std, c10::optionalstateTensor : stateTensorData, probPlaceholder.getMPSGraphTensor() : probPlaceholder.getMPSGraphTensorData() }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return result; diff --git a/aten/src/ATen/native/mps/operations/Eye.mm b/aten/src/ATen/native/mps/operations/Eye.mm index bdbb361a8a1e9..c7d682c3f22e0 100644 --- a/aten/src/ATen/native/mps/operations/Eye.mm +++ b/aten/src/ATen/native/mps/operations/Eye.mm @@ -98,9 +98,7 @@ // Create dictionary of inputs/feeds and outputs/results // In this case, there are no inputs, so the feeds are nil NSDictionary* feeds = nil; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; + auto results = dictionaryFromPlaceholders(outputPlaceholder); // Run the graph runMPSGraph(stream, cachedGraph->graph(), feeds, results); diff --git a/aten/src/ATen/native/mps/operations/FastFourierTransform.mm b/aten/src/ATen/native/mps/operations/FastFourierTransform.mm new file mode 100644 index 0000000000000..21fb75bb2179e --- /dev/null +++ b/aten/src/ATen/native/mps/operations/FastFourierTransform.mm @@ -0,0 +1,179 @@ +#include +#include +#include + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif + +#if !defined(__MAC_14_0) && (!defined(MAC_OS_X_VERSION_14_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_14_0)) +@implementation 
FakeMPSGraphFFTDescriptor ++ (nullable instancetype)descriptor { + // Redispatch the constructor to the actual implementation + id desc = NSClassFromString(@"MPSGraphFFTDescriptor"); + return (FakeMPSGraphFFTDescriptor*)[desc descriptor]; +} + +- (nonnull id)copyWithZone:(nullable NSZone*)zone { + return self; +} +@end +#endif + +namespace at::native { +namespace { +MPSGraphFFTScalingMode normalization_to_ScalingMode(int64_t normalization) { + switch (static_cast(normalization)) { + case fft_norm_mode::none: + return MPSGraphFFTScalingModeNone; + case fft_norm_mode::by_n: + return MPSGraphFFTScalingModeSize; + case fft_norm_mode::by_root_n: + return MPSGraphFFTScalingModeUnitary; + default: + break; + } + TORCH_CHECK(false, "Unsupported normalization type", normalization); +} + +NSArray* IntArrayToNSArray(IntArrayRef arr) { + auto rc = [NSMutableArray arrayWithCapacity:arr.size()]; + for (const auto idx : c10::irange(arr.size())) { + rc[idx] = [NSNumber numberWithInteger:arr[idx]]; + } + return rc; +} + +} // anonymous namespace + +Tensor _fft_c2r_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, int64_t last_dim_size) { + TORCH_CHECK(self.is_complex()); + auto in_sizes = self.sizes(); + DimVector out_sizes(in_sizes.begin(), in_sizes.end()); + out_sizes[dim.back()] = last_dim_size; + auto out = at::empty(out_sizes, self.options().dtype(c10::toRealValueType(self.scalar_type()))); + return _fft_c2r_mps_out(self, dim, normalization, last_dim_size, out); +} + +Tensor _fft_r2c_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided) { + TORCH_CHECK(self.is_floating_point()); + auto input_sizes = self.sizes(); + DimVector out_sizes(input_sizes.begin(), input_sizes.end()); + auto last_dim = dim.back(); + auto last_dim_halfsize = (input_sizes[last_dim]) / 2 + 1; + if (onesided) { + out_sizes[last_dim] = last_dim_halfsize; + } + + auto out = at::empty(out_sizes, self.options().dtype(c10::toComplexType(self.scalar_type()))); + return _fft_r2c_mps_out(self, dim, normalization, onesided, out); +} + +Tensor _fft_c2c_mps(const Tensor& self, IntArrayRef dim, int64_t normalization, bool forward) { + TORCH_CHECK(self.is_complex()); + if (dim.empty()) { + return self.clone(); + } + auto out = at::empty(self.sizes(), self.options()); + return _fft_c2c_mps_out(self, dim, normalization, forward, out); +} + +using namespace mps; + +// TODO: Investigate numerical discrepancies see https://github.com/pytorch/pytorch/issues/120237 +Tensor& _fft_r2c_mps_out(const Tensor& self, IntArrayRef dim, int64_t normalization, bool onesided, Tensor& out) { + TORCH_CHECK(supportsComplex(), "FFT operations are only supported on MacOS 14+"); + auto key = __func__ + getTensorsStringKey({self, out}) + ":" + getArrayRefString(dim) + ":" + + std::to_string(normalization) + ":" + std::to_string(onesided); + @autoreleasepool { + auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { + auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + auto descriptor = [MPSGraphFFTDescriptor descriptor]; + descriptor.scalingMode = normalization_to_ScalingMode(normalization); + MPSGraphTensor* outputTensor; + if (onesided) { + // Return only unique results: + outputTensor = [mpsGraph realToHermiteanFFTWithTensor:inputTensor + axes:IntArrayToNSArray(dim) + descriptor:descriptor + name:nil]; + } else { + // Return with Hermitean conjugate results: + auto useDataType = + (inputTensor.dataType == MPSDataTypeFloat16) ? 
MPSDataTypeComplexFloat16 : MPSDataTypeComplexFloat32; + auto cTensor = [mpsGraph castTensor:inputTensor toType:useDataType name:nil]; + outputTensor = [mpsGraph fastFourierTransformWithTensor:cTensor + axes:IntArrayToNSArray(dim) + descriptor:descriptor + name:nil]; + } + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + }); + auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); + } + return out; +} + +Tensor& _fft_c2r_mps_out(const Tensor& self, + IntArrayRef dim, + int64_t normalization, + int64_t last_dim_size, + Tensor& out) { + TORCH_CHECK(supportsComplex(), "FFT operations are only supported on MacOS 14+"); + auto key = __func__ + getTensorsStringKey({self}) + ":" + getArrayRefString(dim) + ":" + + std::to_string(normalization) + ":" + std::to_string(last_dim_size); + @autoreleasepool { + auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { + auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + auto descriptor = [MPSGraphFFTDescriptor descriptor]; + descriptor.scalingMode = normalization_to_ScalingMode(normalization); + auto outputTensor = [mpsGraph HermiteanToRealFFTWithTensor:inputTensor + axes:IntArrayToNSArray(dim) + descriptor:descriptor + name:nil]; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + }); + auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); + } + return out; +} + +Tensor& _fft_c2c_mps_out(const Tensor& self, IntArrayRef dim, int64_t normalization, bool forward, Tensor& out) { + TORCH_CHECK(supportsComplex(), "FFT operations are only supported on MacOS 14+"); + auto key = __func__ + getTensorsStringKey({self}) + ":" + getArrayRefString(dim) + ":" + + std::to_string(normalization) + ":" + std::to_string(forward); + @autoreleasepool { + auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { + auto inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + auto descriptor = [MPSGraphFFTDescriptor descriptor]; + descriptor.scalingMode = normalization_to_ScalingMode(normalization); + descriptor.inverse = !forward; + auto outputTensor = [mpsGraph fastFourierTransformWithTensor:inputTensor + axes:IntArrayToNSArray(dim) + descriptor:descriptor + name:nil]; + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->outputTensor_ = outputTensor; + }); + auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); + } + return out; +} + +} // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/Gamma.mm b/aten/src/ATen/native/mps/operations/Gamma.mm index 1a6bbb25c05f2..826e7acdde358 100644 --- a/aten/src/ATen/native/mps/operations/Gamma.mm +++ b/aten/src/ATen/native/mps/operations/Gamma.mm @@ -24,7 +24,7 @@ * See note [3-Clause BSD License for the Cephes Math Library]. 
*/ -static const char* GAMMA_OPS_TEMPLATE = R"METAL( +static MetalShaderLibrary lib(R"METAL( #include using namespace metal; @@ -388,45 +388,11 @@ kernel void polygamma(device {0} *input [[buffer(0)]], output[id] = sgn * Gamma(n + 1) * calc_zeta(n + 1, x); }} -)METAL"; +)METAL", + 2); -static id compileGammaOpsLibrary(id device, const std::string& t1, const std::string& t2) { - auto key = t1 + t2; - static std::unordered_map> libMap; - auto it = libMap.find(key); - if (it != libMap.end()) { - return it->second; - } - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; - auto rc = [device newLibraryWithSource:[NSString stringWithUTF8String:fmt::format(GAMMA_OPS_TEMPLATE, t1, t2).c_str()] - options:options - error:&error]; - TORCH_CHECK(rc != nil && error == nil, "Failed to compile library: ", [[error localizedDescription] UTF8String]); - libMap[key] = rc; - return rc; -} - -static id getCPLState(id device, - const std::string& t1, - const std::string& t2, - const std::string& fname) { - auto key = t1 + t2 + fname; - static std::unordered_map> cplMap; - auto it = cplMap.find(key); - if (it != cplMap.end()) { - return it->second; - } - NSError* error = nil; - auto library = compileGammaOpsLibrary(device, t1, t2); - id func = [library newFunctionWithName:[NSString stringWithUTF8String:fname.c_str()]]; - TORCH_CHECK(func != nil, "Can't get function ", fname); - auto rc = [device newComputePipelineStateWithFunction:func error:&error]; - TORCH_CHECK( - rc != nil && error == nil, "Failed to construct pipeline state: ", [[error localizedDescription] UTF8String]); - cplMap[key] = rc; - return rc; +static id getCPLState(const Tensor& t1, const Tensor& t2, const std::string& fname) { + return lib.getPipelineStateForFunc(fname, {scalarToMetalTypeString(t1), scalarToMetalTypeString(t2)}); } } // namespace mps @@ -441,19 +407,15 @@ kernel void polygamma(device {0} *input [[buffer(0)]], return; } - if (!self.is_contiguous()) { + if (mps::needsGather(output_)) { output = output.contiguous(); needs_output_copy = true; } using namespace mps; - std::string input_type = scalarToMetalTypeString(self.scalar_type()); - std::string output_type = scalarToMetalTypeString(output.scalar_type()); - @autoreleasepool { - id device = MPSDevice::getInstance()->device(); - id cplState = getCPLState(device, input_type, output_type, "lgamma"); + id cplState = getCPLState(self, output, "lgamma"); MPSStream* mpsStream = getCurrentMPSStream(); dispatch_sync(mpsStream->queue(), ^() { @@ -485,19 +447,15 @@ kernel void polygamma(device {0} *input [[buffer(0)]], return; } - if (!self.is_contiguous()) { + if (mps::needsGather(output_)) { output = output.contiguous(); needs_output_copy = true; } using namespace mps; - std::string input_type = scalarToMetalTypeString(self.scalar_type()); - std::string output_type = scalarToMetalTypeString(output.scalar_type()); - @autoreleasepool { - id device = MPSDevice::getInstance()->device(); - id cplState = getCPLState(device, input_type, output_type, "digamma"); + id cplState = getCPLState(self, output, "digamma"); MPSStream* mpsStream = getCurrentMPSStream(); dispatch_sync(mpsStream->queue(), ^() { @@ -530,15 +488,13 @@ kernel void polygamma(device {0} *input [[buffer(0)]], return; } - if (!self.is_contiguous()) { + if (mps::needsGather(output_)) { output = output.contiguous(); needs_output_copy = true; } using namespace mps; - std::string input_type = scalarToMetalTypeString(self.scalar_type()); - 
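Stepping back to the new FastFourierTransform.mm earlier in this diff: normalization_to_ScalingMode maps the three PyTorch normalization modes onto MPSGraph's scaling modes (none to None, by_n to Size, by_root_n to Unitary). My reading of the scale each mode implies for a length-n transform, written as an illustrative helper; the enum below is hypothetical and not the patch's fft_norm_mode:

#include <cmath>
#include <cstdint>

enum class ScalingMode { None, Size, Unitary };  // hypothetical mirror of the MPSGraph modes

static double applied_scale(ScalingMode m, int64_t n) {
  switch (m) {
    case ScalingMode::None:    return 1.0;                          // no normalization
    case ScalingMode::Size:    return 1.0 / double(n);              // "by n"
    case ScalingMode::Unitary: return 1.0 / std::sqrt(double(n));   // "by sqrt(n)"
  }
  return 1.0;
}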
std::string output_type = scalarToMetalTypeString(output.scalar_type()); std::string func_name; if (order == 0) { @@ -550,9 +506,7 @@ kernel void polygamma(device {0} *input [[buffer(0)]], } @autoreleasepool { - id device = MPSDevice::getInstance()->device(); - - id cplState = getCPLState(device, input_type, output_type, func_name); + id cplState = getCPLState(self, output, func_name); MPSStream* mpsStream = getCurrentMPSStream(); dispatch_sync(mpsStream->queue(), ^() { diff --git a/aten/src/ATen/native/mps/operations/GridSampler.mm b/aten/src/ATen/native/mps/operations/GridSampler.mm index fc775333b6c71..8589ed28dc54e 100644 --- a/aten/src/ATen/native/mps/operations/GridSampler.mm +++ b/aten/src/ATen/native/mps/operations/GridSampler.mm @@ -116,14 +116,8 @@ static void grid_sampler_2d_mps_impl(Tensor& output, Placeholder gridPlaceholder = Placeholder(cachedGraph->gridTensor_, grid); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - gridPlaceholder.getMPSGraphTensor() : gridPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder, gridPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } } // namespace mps diff --git a/aten/src/ATen/native/mps/operations/HistogramKernel.mm b/aten/src/ATen/native/mps/operations/HistogramKernel.mm index 874e0173658d3..553d792c94708 100644 --- a/aten/src/ATen/native/mps/operations/HistogramKernel.mm +++ b/aten/src/ATen/native/mps/operations/HistogramKernel.mm @@ -21,7 +21,7 @@ BINARY_SEARCH, }; -static const char* METAL_HISTOGRAM = R"HISTOGRAM_METAL( +static MetalShaderLibrary lib(R"HISTOGRAM_METAL( #include using namespace metal; @@ -157,42 +157,7 @@ kernel void kernel_index_offset(constant uint * strides [[buffer data_offsets[thread_index] += remainder * strides[reversed_dim]; } } -)HISTOGRAM_METAL"; - -static id compileHistogramOpLibrary(id device) { - static id histogramLibrary = nil; - if (histogramLibrary) { - return histogramLibrary; - } - - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; - histogramLibrary = [device newLibraryWithSource:[NSString stringWithCString:METAL_HISTOGRAM - encoding:NSASCIIStringEncoding] - options:options - error:&error]; - TORCH_CHECK(histogramLibrary, "Failed to create metal histogram library, error: ", [[error description] UTF8String]); - return histogramLibrary; -} - -static id histogramPipelineState(id device, const std::string& kernel) { - static std::unordered_map> psoCache; - id pso = psoCache[kernel]; - if (pso) { - return pso; - } - - NSError* error = nil; - id crossLib = compileHistogramOpLibrary(device); - id crossFunc = [crossLib newFunctionWithName:[NSString stringWithUTF8String:kernel.c_str()]]; - TORCH_CHECK(crossFunc, "Failed to create function state object for: ", kernel); - pso = [device newComputePipelineStateWithFunction:crossFunc error:&error]; - TORCH_CHECK(pso, "Failed to created pipeline state object, error: ", [[error description] UTF8String]); - - psoCache[kernel] = pso; - return pso; -} +)HISTOGRAM_METAL"); template void histogramdd_kernel_impl(Tensor& hist_output, @@ -279,7 +244,7 @@ void 
histogramdd_kernel_impl(Tensor& hist_output, id stridedIndicesBuffer = [[device newBufferWithLength:stridedIndicesNumThreads * sizeof(uint) options:0] autorelease]; - id stridedIndicesPSO = histogramPipelineState(device, "kernel_index_offset"); + id stridedIndicesPSO = lib.getPipelineStateForFunc("kernel_index_offset"); [computeEncoder setComputePipelineState:stridedIndicesPSO]; [computeEncoder setBytes:strides.data() length:sizeof(uint32_t) * nDim atIndex:0]; @@ -289,8 +254,8 @@ void histogramdd_kernel_impl(Tensor& hist_output, mtl_dispatch1DJob(computeEncoder, stridedIndicesPSO, stridedIndicesNumThreads); - const std::string kernel = "histogramdd_" + scalarToMetalTypeString(input.scalar_type()); - id histogramPSO = histogramPipelineState(device, kernel); + const std::string kernel = "histogramdd_" + scalarToMetalTypeString(input); + id histogramPSO = lib.getPipelineStateForFunc(kernel); // this function call is a no-op if MPS Profiler is not enabled getMPSProfiler().beginProfileKernel(histogramPSO, "histogram", allTensorsList); diff --git a/aten/src/ATen/native/mps/operations/Indexing.mm b/aten/src/ATen/native/mps/operations/Indexing.mm index 01113f699a6b6..55ead2cba1bd8 100644 --- a/aten/src/ATen/native/mps/operations/Indexing.mm +++ b/aten/src/ATen/native/mps/operations/Indexing.mm @@ -39,10 +39,7 @@ #include #include #include -#endif - -#ifdef __OBJC__ -#include +#include #endif namespace at::native { @@ -196,9 +193,8 @@ static void validateInputData(const TensorIteratorBase& iter, TORCH_CHECK(scalar_type == ScalarType::Float || inputTensor.scalar_type() == ScalarType::Int || scalar_type == ScalarType::Bool); } else { - TORCH_CHECK(c10::isIntegralType(scalar_type, /*includesBool=*/true) || scalar_type == ScalarType::Float || - scalar_type == ScalarType::Half || scalar_type == ScalarType::ComplexFloat || - scalar_type == ScalarType::ComplexHalf, + TORCH_CHECK(c10::isIntegralType(scalar_type, /*includesBool=*/true) || supportedFloatingType(scalar_type) || + scalar_type == ScalarType::ComplexFloat || scalar_type == ScalarType::ComplexHalf, getMPSTypeString(inputTensor) + std::string(" not supported for index.Tensor_out")); } } @@ -245,14 +241,20 @@ static void index_put_kernel_mps(TensorIterator& iter, } // namespace mps static Tensor nonzero_fallback(const Tensor& self) { - TORCH_WARN_ONCE("MPS: nonzero op is supported natively starting from macOS 13.0. ", - "Falling back on CPU. This may have performance implications."); - return at::nonzero(self.to("cpu")).clone().to("mps"); } Tensor& nonzero_out_mps(const Tensor& self, Tensor& out_) { - if (!is_macos_13_or_newer()) { + if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS)) { + TORCH_WARN_ONCE("MPS: nonzero op is supported natively starting from macOS 13.0. ", + "Falling back on CPU. This may have performance implications."); + Tensor out_fallback = nonzero_fallback(self); + at::native::resize_output(out_, out_fallback.sizes()); + out_.copy_(out_fallback.to("mps")); + return out_; + } else if (self.is_complex()) { + TORCH_WARN_ONCE("MPS: nonzero op is not supported for complex datatypes. ", + "Falling back on CPU. 
This may have performance implications."); Tensor out_fallback = nonzero_fallback(self); at::native::resize_output(out_, out_fallback.sizes()); out_.copy_(out_fallback.to("mps")); @@ -270,7 +272,7 @@ static Tensor nonzero_fallback(const Tensor& self) { TORCH_CHECK(self.numel() < std::numeric_limits::max(), "nonzero is not supported for tensors with more than INT_MAX elements, \ - file a support request"); + See https://github.com/pytorch/pytorch/issues/51871"); TORCH_CHECK( out_.dtype() == at::kLong, "Expected object of scalar type ", at::kLong, " as out, but got ", out_.dtype()); TORCH_CHECK(self.device() == out_.device(), @@ -286,7 +288,6 @@ static Tensor nonzero_fallback(const Tensor& self) { CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} MPSGraphTensor* inputTensor_ = nil; MPSGraphTensor* outputTensor_ = nil; - MPSGraphTensor* scatterDataTensor_ = nil; }; dispatch_sync(stream->queue(), ^() { @@ -298,109 +299,27 @@ static Tensor nonzero_fallback(const Tensor& self) { return out_; } - bool contiguous_output = out_.is_contiguous(); + bool contiguous_output = !needsGather(out_); Tensor out = out_; if (!contiguous_output) { out = at::empty(out_.sizes(), out_.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); } - int64_t _apparentInputShape = 1; - for (auto dim : self.sizes()) { - _apparentInputShape *= dim; - } - MPSShape* apparentOutputShape = @[ @(total_nonzero * nDim) ]; - MPSShape* apparentInputShape = @[ @(_apparentInputShape) ]; - - // Pseudocode: - // - // inputTensor = [1, 0, 0, 3] - // inputNonZero = [1, 0, 0, 1] - // indices = [1, 1, 1, 2] - // maskedIndices = [0, -1, -1, 1] - // coordinates = [0, 1, 2, 3] - // scatterResult = [0, 3] - @autoreleasepool { string key = "nonzero_out_mps" + getTensorsStringKey(self); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - MPSDataType inputDataType = getMPSDataType(self); - MPSShape* inputShape = getMPSShape(self); - - MPSGraphTensor* inputTensor = - mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(self.scalar_type()), apparentInputShape); - MPSGraphTensor* scatterDataTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSScalarType(out.scalar_type())); - MPSGraphTensor* zeroTensor = [mpsGraph constantWithScalar:0.0 dataType:inputDataType]; - MPSGraphTensor* oneTensor = [mpsGraph constantWithScalar:1.0 dataType:MPSDataTypeInt32]; - MPSGraphTensor* minusMaxDimTensor = [mpsGraph constantWithScalar:-maxDimensions dataType:MPSDataTypeInt32]; - MPSGraphTensor* inputNotEqualToZeroTensor = [mpsGraph notEqualWithPrimaryTensor:inputTensor - secondaryTensor:zeroTensor - name:nil]; - MPSGraphTensor* maskTensor = [mpsGraph castTensor:inputNotEqualToZeroTensor - toType:MPSDataTypeInt32 - name:@"castToInt32"]; - MPSGraphTensor* indicesTensor = [mpsGraph cumulativeSumWithTensor:maskTensor axis:0 name:nil]; - MPSGraphTensor* indicesMinusOneTensor = [mpsGraph subtractionWithPrimaryTensor:indicesTensor - secondaryTensor:oneTensor - name:nil]; - MPSGraphTensor* maskedIndicesTensor = [mpsGraph selectWithPredicateTensor:inputNotEqualToZeroTensor - truePredicateTensor:indicesMinusOneTensor - falsePredicateTensor:minusMaxDimTensor - name:nil]; - MPSGraphTensor* coordinatesTensor = [mpsGraph reshapeTensor:[mpsGraph coordinateAlongAxis:0 - withShape:inputShape - name:nil] - withShape:@[ @-1 ] - name:nil]; - if (nDim > 1) { - NSMutableArray* maskedIndicesTensorArray = [NSMutableArray arrayWithCapacity:nDim]; - NSMutableArray* coordinatesTensorArray = [NSMutableArray arrayWithCapacity:nDim]; - - 
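For context on the scatter-based nonzero graph being deleted in this hunk (replaced by a single nonZeroIndicesOfTensor: call): per its own pseudocode comment above, it computed a nonzero mask, a cumulative sum of that mask, and then scattered each element's coordinate to slot (cumsum - 1). A plain C++ sketch of that algorithm for a 1-D input, illustration only:

#include <cstdint>
#include <vector>

// mask -> cumulative sum -> scatter of coordinates, for a 1-D input.
static std::vector<int64_t> nonzero_1d(const std::vector<float>& input) {
  std::vector<int64_t> out;
  int64_t cumsum = 0;                       // running count of nonzeros so far
  for (int64_t i = 0; i < static_cast<int64_t>(input.size()); ++i) {
    const bool mask = input[i] != 0.0f;     // inputNonZero
    cumsum += mask;                         // indices
    if (mask) {
      out.push_back(i);                     // lands at position cumsum - 1
    }
  }
  return out;
}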
MPSGraphTensor* constantRankTensor = [mpsGraph constantWithScalar:nDim dataType:MPSDataTypeInt32]; - maskedIndicesTensorArray[0] = [mpsGraph multiplicationWithPrimaryTensor:maskedIndicesTensor - secondaryTensor:constantRankTensor - name:nil]; - coordinatesTensorArray[0] = coordinatesTensor; - for (int i = 1; i < nDim; i++) { - maskedIndicesTensorArray[i] = [mpsGraph additionWithPrimaryTensor:maskedIndicesTensorArray[i - 1] - secondaryTensor:oneTensor - name:nil]; - coordinatesTensorArray[i] = [mpsGraph reshapeTensor:[mpsGraph coordinateAlongAxis:i - withShape:inputShape - name:nil] - withShape:@[ @-1 ] - name:nil]; - } - maskedIndicesTensor = [mpsGraph concatTensors:maskedIndicesTensorArray dimension:0 interleave:YES name:nil]; - coordinatesTensor = [mpsGraph concatTensors:coordinatesTensorArray dimension:0 interleave:YES name:nil]; - } + MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(self), getMPSShape(self)); - MPSGraphTensor* outputTensor = [mpsGraph scatterWithDataTensor:scatterDataTensor - updatesTensor:coordinatesTensor - indicesTensor:maskedIndicesTensor - axis:0 - mode:MPSGraphScatterModeSet - name:nil]; + MPSGraphTensor* outputTensor = [mpsGraph nonZeroIndicesOfTensor:inputTensor name:nil]; newCachedGraph->inputTensor_ = inputTensor; - newCachedGraph->scatterDataTensor_ = scatterDataTensor; newCachedGraph->outputTensor_ = outputTensor; }); - Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self, apparentInputShape); - Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out, apparentOutputShape); - Placeholder scatterPlaceholder = Placeholder(cachedGraph->scatterDataTensor_, out, apparentOutputShape); - - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - scatterPlaceholder.getMPSGraphTensor() : scatterPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = @{ - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), - }; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } if (!contiguous_output) { @@ -411,7 +330,13 @@ static Tensor nonzero_fallback(const Tensor& self) { } Tensor nonzero_mps(const Tensor& self) { - if (!is_macos_13_or_newer()) { + if (!is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS)) { + TORCH_WARN_ONCE("MPS: nonzero op is supported natively starting from macOS 13.0. ", + "Falling back on CPU. This may have performance implications."); + return nonzero_fallback(self); + } else if (self.is_complex()) { + TORCH_WARN_ONCE("MPS: nonzero op is not supported for complex datatypes ", + "Falling back on CPU. 
This may have performance implications."); return nonzero_fallback(self); } @@ -484,14 +409,8 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) { Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result, /*mpsShape*/ nil, /*gatherTensorData=*/false, outputDataType); - NSDictionary* feeds = - @{inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - // Run the graph - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return result; @@ -568,10 +487,7 @@ Tensor flip_mps(const Tensor& self, IntArrayRef dims) { sourcePlaceholder.getMPSGraphTensor() : sourcePlaceholder.getMPSGraphTensorData(), cachedGraph->alphaTensor_ : getMPSGraphTensorFromScalar(stream, alpha_scalar), }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -625,7 +541,6 @@ Tensor index_select_mps(const Tensor& self, int64_t dim, const Tensor& index) { " and ", output.size(dim), "."); - TORCH_CHECK(!self.is_complex(), "index_select(): Yet not supported for complex"); for (const auto i : irange(self.dim())) { if (i == dim) @@ -651,6 +566,14 @@ Tensor index_select_mps(const Tensor& self, int64_t dim, const Tensor& index) { return output; } + // As of MacOS 14.4 gatherWithUpdatesTensor: still does not support complex + // So back to old view_as_real trick + if (self.is_complex()) { + auto out_view = at::view_as_real(output); + index_select_out_mps(at::view_as_real(self), dim, index, out_view); + return output; + } + // Derive from MPSCachedGraph struct CachedGraph : public MPSCachedGraph { CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} @@ -697,14 +620,8 @@ Tensor index_select_mps(const Tensor& self, int64_t dim, const Tensor& index) { /*gatherTensorData=*/false, /*dataType=*/outputType); - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - indexPlaceholder.getMPSGraphTensor() : indexPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, indexPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return output; @@ -785,10 +702,7 @@ Tensor index_select_mps(const Tensor& self, int64_t dim, const Tensor& index) { cachedGraph->valueTensor_ : getMPSGraphTensorFromScalar(stream, valueScalar) }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } namedinference::propagate_names_if_nonempty(self, maybe_outnames); return self; @@ -862,14 +776,8 @@ Tensor embedding_dense_backward_mps(const Tensor& grad_, auto indicesPlaceholder = Placeholder(cachedGraph->indicesTensor_, indices); auto outgoingGradPlaceholder = Placeholder(cachedGraph->outgoingGradTensor_, outgoing_gradient); - NSDictionary* feeds = @{ - 
incomingGradPlaceholder.getMPSGraphTensor() : incomingGradPlaceholder.getMPSGraphTensorData(), - indicesPlaceholder.getMPSGraphTensor() : indicesPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{outgoingGradPlaceholder.getMPSGraphTensor() : outgoingGradPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(incomingGradPlaceholder, indicesPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outgoingGradPlaceholder); } return outgoing_gradient; } @@ -1010,15 +918,8 @@ Tensor embedding_dense_backward_mps(const Tensor& grad_, /*gatherTensorData=*/false, /*dataType=*/inputType); - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - indexPlaceholder.getMPSGraphTensor() : indexPlaceholder.getMPSGraphTensorData(), - updatePlaceholder.getMPSGraphTensor() : updatePlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, indexPlaceholder, updatePlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return self; } diff --git a/aten/src/ATen/native/mps/operations/Inverse.mm b/aten/src/ATen/native/mps/operations/Inverse.mm index ae142f02fba46..176222f2deeeb 100644 --- a/aten/src/ATen/native/mps/operations/Inverse.mm +++ b/aten/src/ATen/native/mps/operations/Inverse.mm @@ -53,13 +53,8 @@ Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, A); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); - NSDictionary* feeds = - @{inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } diff --git a/aten/src/ATen/native/mps/operations/Lerp.mm b/aten/src/ATen/native/mps/operations/Lerp.mm index ca674336a907f..1ad34ef9a566a 100644 --- a/aten/src/ATen/native/mps/operations/Lerp.mm +++ b/aten/src/ATen/native/mps/operations/Lerp.mm @@ -1,5 +1,6 @@ #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #ifndef AT_PER_OPERATOR_HEADERS #include @@ -11,8 +12,38 @@ namespace at::native { TORCH_IMPL_FUNC(lerp_Tensor_mps)(const Tensor& self, const Tensor& end, const Tensor& weight, const Tensor& out) { - // TODO: Write a much better implementation - at::add_out(const_cast(out), self, weight.mul(end.sub(self))); + TORCH_CHECK(out.is_mps()); + std::array args{{{out, "out", 0}, {self, "self", 1}, {end, "end", 2}, {weight, "weight", 3}}}; + checkAllSameGPU(__func__, args); + using namespace mps; + struct CachedGraph : public MPSCachedGraph { + CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} + MPSGraphTensor* selfTensor_ = nil; + MPSGraphTensor* endTensor_ = nil; + MPSGraphTensor* weightTensor_ = nil; + MPSGraphTensor* outputTensor_ = nil; + }; + @autoreleasepool { + string key = "lerp_Tensor_mps" + getTensorsStringKey({self, end, weight}); + auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto graph) { + auto selfTensor = mpsGraphRankedPlaceHolder(mpsGraph, self); + auto endTensor = 
mpsGraphRankedPlaceHolder(mpsGraph, end); + auto weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight); + auto distance = [mpsGraph subtractionWithPrimaryTensor:endTensor secondaryTensor:selfTensor name:nil]; + auto weighedDistance = [mpsGraph multiplicationWithPrimaryTensor:weightTensor secondaryTensor:distance name:nil]; + auto output = [mpsGraph additionWithPrimaryTensor:selfTensor secondaryTensor:weighedDistance name:nil]; + graph->selfTensor_ = selfTensor; + graph->endTensor_ = endTensor; + graph->weightTensor_ = weightTensor; + graph->outputTensor_ = output; + }); + auto selfPlaceholder = Placeholder(cachedGraph->selfTensor_, self); + auto endPlaceholder = Placeholder(cachedGraph->endTensor_, end); + auto weightPlaceholder = Placeholder(cachedGraph->weightTensor_, weight); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, endPlaceholder, weightPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); + } } } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/Linear.mm b/aten/src/ATen/native/mps/operations/Linear.mm index 4e556189b0f1f..450e24c77c9d4 100644 --- a/aten/src/ATen/native/mps/operations/Linear.mm +++ b/aten/src/ATen/native/mps/operations/Linear.mm @@ -15,11 +15,17 @@ Tensor _mps_linear(const Tensor& input, const Tensor& weight_arg, const c10::opt auto weight = (weight_arg.dim() == 1) ? weight_arg.view({1, weight_arg.size(0)}) : weight_arg; - TORCH_CHECK(input.scalar_type() == ScalarType::Float || input.scalar_type() == ScalarType::Half, - "MPS device does not support linear for non-float inputs"); + TORCH_CHECK(supportedFloatingType(input), "MPS device does not support linear for non-float inputs"); + TORCH_CHECK(input.is_mps(), "Tensor for argument input is on ", input.device(), " but expected on mps"); + TORCH_CHECK(supportedFloatingType(weight_arg), "MPS device does not support linear for non-float weights"); + TORCH_CHECK(weight_arg.is_mps(), "Tensor for argument weight is on ", weight_arg.device(), " but expected on mps"); const Tensor& bias = *(at::borrow_from_optional_tensor(bias_opt)); - bool is_bias_defined = bias.defined(); + const bool is_bias_defined = bias.defined(); + if (is_bias_defined) { + TORCH_CHECK(bias.is_mps(), "Tensor for argument bias is on ", bias.device(), " but expected on mps"); + TORCH_CHECK(supportedFloatingType(bias), "MPS device does not support linear for non-float bias"); + } auto input_size = input.sizes(); std::vector output_size(input_size.begin(), input_size.end() - 1); @@ -68,31 +74,26 @@ Tensor _mps_linear(const Tensor& input, const Tensor& weight_arg, const c10::opt dimension:-1 withDimension:-2 name:nil]; - MPSGraphTensor* outputTensor = nil; - - if (!is_bias_defined) { - outputTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:inputTensor - secondaryTensor:weightTransposeTensor - name:nil]; - } else { - MPSGraphTensor* inputFlattened = inputTensor; - bool doReshape = false; + // matrixMultiplicationWithPrimary crashes for 5D tensors, see https://github.com/pytorch/pytorch/issues/114942 + bool doReshape = input.dim() > 4; + if (!doReshape && is_bias_defined) { // workaround to improve the performance with 3D+ inputs - if (input_size.size() > 2 && input_size[0] > 1 && input_size[1] >= 1 && input_size[1] <= 32 && - bias.dim() <= 1) { - doReshape = true; - inputFlattened = [mpsGraph flatten2DTensor:inputTensor axis:-1 name:nil]; - } + doReshape = + input_size.size() > 2 && 
input_size[0] > 1 && input_size[1] >= 1 && input_size[1] <= 32 && bias.dim() <= 1; + } + auto inputFlattened = doReshape ? [mpsGraph flatten2DTensor:inputTensor axis:-1 name:nil] : inputTensor; + auto outputTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:inputFlattened + secondaryTensor:weightTransposeTensor + name:nil]; + if (is_bias_defined) { newCachedGraph->biasTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, bias); - MPSGraphTensor* xMulWTTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:inputFlattened - secondaryTensor:weightTransposeTensor - name:nil]; - MPSGraphTensor* biasedTensor = [mpsGraph additionWithPrimaryTensor:xMulWTTensor - secondaryTensor:newCachedGraph->biasTensor_ - name:nil]; - outputTensor = doReshape ? [mpsGraph reshapeTensor:biasedTensor withShape:getMPSShape(output_size) name:nil] - : biasedTensor; + outputTensor = [mpsGraph additionWithPrimaryTensor:outputTensor + secondaryTensor:newCachedGraph->biasTensor_ + name:nil]; + } + if (doReshape) { + outputTensor = [mpsGraph reshapeTensor:outputTensor withShape:getMPSShape(output_size) name:nil]; } newCachedGraph->inputTensor_ = inputTensor; @@ -112,10 +113,7 @@ Tensor _mps_linear(const Tensor& input, const Tensor& weight_arg, const c10::opt biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias); feeds[biasPlaceholder.getMPSGraphTensor()] = biasPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } // Shave off '1' present at the end of the shape @@ -130,13 +128,11 @@ Tensor _mps_linear(const Tensor& input, const Tensor& weight_arg, const c10::opt static Tensor _mps_linear_backward_input(IntArrayRef input_size, const Tensor& grad_output, const Tensor& weight) { TORCH_CHECK(grad_output.is_mps(), "mps_linear_backward: grad_output needs to be mps layout"); - TORCH_CHECK(weight.device().is_mps() && (weight.scalar_type() == kFloat || (weight.scalar_type() == kHalf)), + TORCH_CHECK(weight.device().is_mps() && supportedFloatingType(weight), "mps_linear_backward: unsupported weights data type: ", weight.scalar_type()); - TORCH_CHECK(grad_output.scalar_type() == ScalarType::Double || grad_output.scalar_type() == ScalarType::Float || - grad_output.scalar_type() == ScalarType::Half, - "MPS device does not support linear backward for non-float inputs"); + TORCH_CHECK(supportedFloatingType(grad_output), "MPS device does not support linear backward for non-float inputs"); const Tensor weight_reshaped = weight.is_contiguous() ? 
weight : weight.contiguous(); @@ -159,15 +155,23 @@ static Tensor _mps_linear_backward_input(IntArrayRef input_size, const Tensor& g @autoreleasepool { string key = "mps_linear_backward_input" + getTensorsStringKey({grad_output, weight_reshaped}); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto* mpsGraph, auto* newCachedGraph) { - MPSGraphTensor* weightTensor = mpsGraphRankedPlaceHolder(mpsGraph, weight_reshaped); - MPSGraphTensor* gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); - - MPSGraphTensor* outputTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:gradOutputTensor - secondaryTensor:weightTensor - name:nil]; + newCachedGraph->weightTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, weight_reshaped); + newCachedGraph->gradOutputTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); + + // MPS matrixMultiplication crashes for 5D+ tensors on 14.2.1 with `New volume should match old volume` + // See https://github.com/pytorch/pytorch/issues/114942 for more details + bool needReshape = grad_output.dim() > 4; + auto gradOutputTensor = needReshape + ? [mpsGraph flatten2DTensor:newCachedGraph->gradOutputTensor_ axis:-1 name:nil] + : newCachedGraph->gradOutputTensor_; + + auto outputTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:gradOutputTensor + secondaryTensor:newCachedGraph->weightTensor_ + name:nil]; + if (needReshape) { + outputTensor = [mpsGraph reshapeTensor:outputTensor withShape:getMPSShape(output) name:nil]; + } - newCachedGraph->weightTensor_ = weightTensor; - newCachedGraph->gradOutputTensor_ = gradOutputTensor; newCachedGraph->outputTensor_ = outputTensor; }); @@ -175,15 +179,8 @@ static Tensor _mps_linear_backward_input(IntArrayRef input_size, const Tensor& g Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - NSDictionary* feeds = @{ - weightPlaceholder.getMPSGraphTensor() : weightPlaceholder.getMPSGraphTensorData(), - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(weightPlaceholder, gradOutputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); return output; } @@ -196,8 +193,7 @@ static Tensor _mps_linear_backward_input(IntArrayRef input_size, const Tensor& g TORCH_CHECK(grad_output.is_mps() && input.is_mps(), "_mps_linear_backward: grad_output and input needs to be mps layout"); - TORCH_CHECK(grad_output.scalar_type() == ScalarType::Float || grad_output.scalar_type() == ScalarType::Half, - "MPS device does not support linear backward for non-float inputs"); + TORCH_CHECK(supportedFloatingType(grad_output), "MPS device does not support linear backward for non-float inputs"); struct CachedGraph : public MPSCachedGraph { CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} @@ -273,17 +269,9 @@ static Tensor _mps_linear_backward_input(IntArrayRef input_size, const Tensor& g Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); Placeholder biasPlaceholder = Placeholder(cachedGraph->biasTensor_, bias); - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - inputPlaceholder.getMPSGraphTensor() : 
inputPlaceholder.getMPSGraphTensorData(), - weightPlaceholder.getMPSGraphTensor() : weightPlaceholder.getMPSGraphTensorData() - }; - - NSMutableDictionary* results = [NSMutableDictionary dictionary]; - results[outputPlaceholder.getMPSGraphTensor()] = outputPlaceholder.getMPSGraphTensorData(); - if (bias_defined) - results[biasPlaceholder.getMPSGraphTensor()] = biasPlaceholder.getMPSGraphTensorData(); - + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, inputPlaceholder, weightPlaceholder); + auto results = bias_defined ? dictionaryFromPlaceholders(outputPlaceholder, biasPlaceholder) + : dictionaryFromPlaceholders(outputPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); return std::tuple{output, bias}; diff --git a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm index 5d2e01f457c2d..002426db125e9 100644 --- a/aten/src/ATen/native/mps/operations/LinearAlgebra.mm +++ b/aten/src/ATen/native/mps/operations/LinearAlgebra.mm @@ -4,6 +4,8 @@ #include #include #include +// For MTLLanguageVersion_3_1 +#include #include #ifndef AT_PER_OPERATOR_HEADERS @@ -22,16 +24,114 @@ namespace at::native { namespace mps { +namespace { +static MetalShaderLibrary lib(R"MATMUL_METAL( +#include + +using namespace metal; +template +T dot_product(constant T *v1, constant T* v2, ulong2 strides, uint32_t size) { + T rc = T(0.0); + for (uint32_t i = 0; i < size; ++i) { + rc += v1[i * strides.x] * v2[i * strides.y]; + } + return rc; +} -enum LinearAlgebraOpType { ADDBMM_OP_TYPE, BADDBMM_OP_TYPE }; +template +kernel void naive_matmul( + constant T * mat1Data [[buffer(0)]], + constant T * mat2Data [[buffer(1)]], + device T * outputData [[buffer(2)]], + constant array & strides [[buffer(3)]], + constant uint3 & sizes [[buffer(4)]], + uint thread_index [[thread_position_in_grid]]) { + uint y = thread_index / sizes.x; + uint x = thread_index % sizes.x; + if (x >= sizes.x || y >= sizes.z) { + return; + } + auto rc = dot_product(mat1Data + x * strides[0].x, + mat2Data + y * strides[1].y, + ulong2(strides[0].y, strides[1].x), + sizes.y); + outputData[x * strides[2].x + y * strides[2].y] = rc; +} + +#define INSTANTIATE_NAIVE_MM(DTYPE) \ +template \ +[[host_name("naive_matmul_" #DTYPE)]] \ +kernel void naive_matmul( \ + constant DTYPE * mat1Data [[buffer(0)]], \ + constant DTYPE * mat2Data [[buffer(1)]], \ + device DTYPE * outputData [[buffer(2)]], \ + constant array & strides [[buffer(3)]], \ + constant uint3 & sizes [[buffer(4)]], \ + uint thread_index [[thread_position_in_grid]]) + +INSTANTIATE_NAIVE_MM(float); +INSTANTIATE_NAIVE_MM(half); +#if __METAL_VERSION__ >= 310 +INSTANTIATE_NAIVE_MM(bfloat); +#endif +)MATMUL_METAL"); + +Tensor& do_metal_mm(const Tensor& self, const Tensor& other, Tensor& output) { + auto stream = getCurrentMPSStream(); + auto device = MPSDevice::getInstance()->device(); + auto matmulPSO = lib.getPipelineStateForFunc("naive_matmul_" + mps::scalarToMetalTypeString(output)); + dispatch_sync_with_rethrow(stream->queue(), ^() { + @autoreleasepool { + getMPSProfiler().beginProfileKernel(matmulPSO, "naive_matmul", {self, other}); + auto computeEncoder = stream->commandEncoder(); + [computeEncoder setComputePipelineState:matmulPSO]; + std::array sizes = {static_cast(self.size(0)), + static_cast(self.size(1)), + static_cast(output.size(1))}; + std::array strides = { + self.stride(0), self.stride(1), other.stride(0), other.stride(1), output.stride(0), output.stride(1)}; + mtl_setBuffer(computeEncoder, self, 0); + 
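As a sanity check on the kernel's index math: each thread of naive_matmul produces one output element as a strided dot product, C[x, y] = dot(A[x, :], B[:, y]), with x = thread_index % M, y = thread_index / M, and sizes packed as (M, K, N) in do_metal_mm. A rough Python reference for that per-element computation, offered only as a sketch of the arithmetic:

```python
import torch

# Per-element reference for the naive Metal matmul: C[x, y] = dot(A[x, :], B[:, y]).
# The kernel advances A by stride(1) and B by stride(0) along K, so transposed or
# otherwise strided views are handled without a copy.
def naive_matmul_reference(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
    m, k = a.shape
    n = b.shape[1]
    out = torch.empty(m, n, dtype=a.dtype)
    for x in range(m):       # x = thread_index % M
        for y in range(n):   # y = thread_index / M
            out[x, y] = (a[x, :] * b[:, y]).sum()
    return out

a, b = torch.randn(5, 7), torch.randn(7, 3)
torch.testing.assert_close(naive_matmul_reference(a, b), a @ b)
```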
mtl_setBuffer(computeEncoder, other, 1); + mtl_setBuffer(computeEncoder, output, 2); + [computeEncoder setBytes:strides.data() length:sizeof(uint64_t) * strides.size() atIndex:3]; + [computeEncoder setBytes:sizes.data() length:sizeof(uint32_t) * sizes.size() atIndex:4]; + mtl_dispatch1DJob(computeEncoder, matmulPSO, output.numel()); + getMPSProfiler().endProfileKernel(matmulPSO); + } + }); + return output; +} + +std::tuple do_mm(MPSGraph* graph, + const Tensor& self, + const Tensor& other) { + if (self.numel() == 0 || other.numel() == 0) { + auto output = [graph constantWithScalar:0.0 + shape:getMPSShape({self.size(0), other.size(1)}) + dataType:getMPSDataType(self)]; + return {nil, nil, output}; + } + auto selfTensor = mpsGraphRankedPlaceHolder(graph, self); + auto otherTensor = mpsGraphRankedPlaceHolder(graph, other); + auto output = [graph matrixMultiplicationWithPrimaryTensor:selfTensor secondaryTensor:otherTensor name:nil]; + return {selfTensor, otherTensor, output}; +} + +bool use_metal_mm(const Tensor& self, const Tensor& other, const Tensor& output) { + static bool always_use_metal = std::getenv("PYTORCH_MPS_PREFER_METAL") != nullptr; + constexpr auto max_stride_size = 32768; + return always_use_metal || self.stride(0) > max_stride_size || self.stride(1) > max_stride_size || + self.size(0) > max_stride_size || self.size(1) > max_stride_size || other.stride(0) > max_stride_size || + other.stride(1) > max_stride_size || other.size(0) > max_stride_size || other.size(1) > max_stride_size; +} + +} // anonymous namespace static Tensor& mm_out_mps_impl(const Tensor& self, const Tensor& other, Tensor& output) { using namespace mps; using CachedGraph = MPSBinaryCachedGraph; TORCH_CHECK(self.dim() == 2 && other.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK(self.scalar_type() == ScalarType::Double || self.scalar_type() == ScalarType::Float || - self.scalar_type() == ScalarType::Half, - "MPS device does not support mm for non-float inputs"); + TORCH_CHECK(supportedFloatingType(self), "MPS device does not support mm for non-float inputs"); TensorArg args[]{{output, "out", 0}, {self, "mat1", 1}, {other, "mat2", 2}}; checkAllSameGPU("mm", args); @@ -39,60 +139,38 @@ TORCH_CHECK(output.is_mps()); // Transpose inputs if needed - IntArrayRef output_sizes = output.sizes(); - if ((output_sizes[0] == 0) || (output_sizes[1] == 0)) { + if (output.numel() == 0) { return output; } - MPSStream* stream = getCurrentMPSStream(); + // MPS matmul returns silently incorrect results if one of the matrix dimensions is greater than 2**15 + // And crashes if its a view of matrix with dimensions larger than 2**15 + // See https://github.com/pytorch/pytorch/issues/116769#issuecomment-1888302095 + // In such cases, fallback to naive but accurate metal shader + if (use_metal_mm(self, other, output)) { + return do_metal_mm(self, other, output); + } @autoreleasepool { string key = "mm_out_mps_impl" + getTensorsStringKey({self, other}); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - MPSGraphTensor* selfTensor = nil; - MPSGraphTensor* otherTensor = nil; - MPSGraphTensor* outputTensor = nil; - - if (self.numel() == 0 || other.numel() == 0) { - outputTensor = [mpsGraph constantWithScalar:0. 
shape:getMPSShape(output_sizes) dataType:getMPSDataType(output)]; - - } else { - selfTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, self); - otherTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, other); - outputTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:selfTensor secondaryTensor:otherTensor name:nil]; - } - - newCachedGraph->inputTensor_ = selfTensor; - newCachedGraph->otherTensor_ = otherTensor; - newCachedGraph->outputTensor_ = outputTensor; + std::tie(newCachedGraph->inputTensor_, newCachedGraph->otherTensor_, newCachedGraph->outputTensor_) = + do_mm(mpsGraph, self, other); }); - Placeholder selfPlaceholder = Placeholder(); - Placeholder otherPlaceholder = Placeholder(); + auto selfPlaceholder = self.numel() != 0 ? Placeholder(cachedGraph->inputTensor_, self) : Placeholder(); + auto otherPlaceholder = other.numel() != 0 ? Placeholder(cachedGraph->otherTensor_, other) : Placeholder(); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - if (!(self.numel() == 0 || other.numel() == 0)) { - selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); - otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other); - } - Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - - NSDictionary* feeds = nil; - - if (!(self.numel() == 0 || other.numel() == 0)) - feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = self.numel() != 0 ? dictionaryFromPlaceholders(selfPlaceholder, otherPlaceholder) : nil; + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } return output; } +enum LinearAlgebraOpType { ADDBMM_OP_TYPE, BADDBMM_OP_TYPE }; + static Tensor& addbmm_or_baddbmm_out_mps_impl(const Tensor& input, const Tensor& batch1, const Tensor& batch2, @@ -107,9 +185,7 @@ TORCH_CHECK(batch2.is_mps()); TORCH_CHECK(result.is_mps()); - TORCH_CHECK(batch1.scalar_type() == ScalarType::Double || batch1.scalar_type() == ScalarType::Float || - batch1.scalar_type() == ScalarType::Half, - "MPS device does not support addbmm or baddbmm for non-float inputs"); + TORCH_CHECK(supportedFloatingType(batch1), "MPS device does not support addbmm or baddbmm for non-float inputs"); TORCH_CHECK(batch1.dim() == 3, "batch1 must be a 3D tensor"); TORCH_CHECK(batch2.dim() == 3, "batch2 must be a 3D tensor"); @@ -193,21 +269,14 @@ newCachedGraph->batch2Tensor_ = batch2Tensor; newCachedGraph->outputTensor_ = outputTensor; }); + Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input); Placeholder batch1Placeholder = Placeholder(cachedGraph->batch1Tensor_, batch1); Placeholder batch2Placeholder = Placeholder(cachedGraph->batch2Tensor_, batch2); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - batch1Placeholder.getMPSGraphTensor() : batch1Placeholder.getMPSGraphTensorData(), - batch2Placeholder.getMPSGraphTensor() : batch2Placeholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + 
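The naive Metal path above is selected automatically whenever a matrix dimension or stride exceeds 32768, and it can also be forced for testing through the PYTORCH_MPS_PREFER_METAL environment variable checked in use_metal_mm. A minimal smoke test, assuming only the public torch API (the flag is cached in a function-local static, so it has to be set before the first matmul is dispatched):

```python
# Force the naive Metal matmul path and compare against a CPU reference.
# PYTORCH_MPS_PREFER_METAL is read once, so set it before any mm is dispatched.
import os
os.environ["PYTORCH_MPS_PREFER_METAL"] = "1"

import torch

a = torch.randn(64, 48, device="mps")
b = torch.randn(48, 32, device="mps")
out = a @ b                      # routed to do_metal_mm instead of MPSGraph
ref = a.cpu() @ b.cpu()
torch.testing.assert_close(out.cpu(), ref, rtol=1e-4, atol=1e-4)
```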
auto feeds = dictionaryFromPlaceholders(inputPlaceholder, batch1Placeholder, batch2Placeholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return result; @@ -223,9 +292,7 @@ TORCH_CHECK(output.is_mps()); TORCH_CHECK(self.dim() == 2 && other.dim() == 2, "tensors must be 2-D"); - TORCH_CHECK(self.scalar_type() == ScalarType::Double || self.scalar_type() == ScalarType::Float || - self.scalar_type() == ScalarType::Half, - "MPS device does not support addmm for non-float input"); + TORCH_CHECK(supportedFloatingType(self), "MPS device does not support addmm for non-float input"); TensorArg args[]{{output, "out", 0}, {bias, "self", 1}, {self, "mat1", 2}, {other, "mat2", 3}}; checkAllSameGPU(__func__, args); @@ -248,13 +315,10 @@ if (&output != &self) { output.resize_(bias_sizes); } - IntArrayRef output_sizes = output.sizes(); - if ((output_sizes[0] == 0) || (output_sizes[1] == 0)) { + if (output.numel() == 0) { return output; } - MPSStream* stream = getCurrentMPSStream(); - bool is_beta_non_zero = beta.toDouble() != 0.0; struct CachedGraph : public mps::MPSCachedGraph { @@ -269,15 +333,13 @@ string key = "addmm_out_mps_impl" + getTensorsStringKey({self, other, *bias_}) + ":" + to_string(beta.toDouble()) + ":" + to_string(alpha.toDouble()); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - MPSGraphTensor* selfTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, self); - MPSGraphTensor* otherTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, other); - MPSGraphTensor* biasTensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, *bias_); + MPSGraphTensor* selfTensor = nil; + MPSGraphTensor* otherTensor = nil; + MPSGraphTensor* productTensor = nil; + MPSGraphTensor* biasTensor = mpsGraphRankedPlaceHolder(mpsGraph, *bias_); // TODO: Use alpha and beta here with fill_.Scalar and mul - // Intermediate as placeholder - MPSGraphTensor* productTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:selfTensor - secondaryTensor:otherTensor - name:@"MM/(mat1@mat2)"]; + std::tie(selfTensor, otherTensor, productTensor) = do_mm(mpsGraph, self, other); auto productTimesAlphaTensor = productTensor; if (alpha.toDouble() != 1.0) { @@ -309,21 +371,14 @@ newCachedGraph->outputTensor_ = outputTensor; }); - Placeholder selfPlaceholder = Placeholder(cachedGraph->selfTensor_, self); - Placeholder otherPlaceholder = Placeholder(cachedGraph->otherTensor_, other); + Placeholder selfPlaceholder = self.numel() != 0 ? Placeholder(cachedGraph->selfTensor_, self) : Placeholder(); + Placeholder otherPlaceholder = other.numel() != 0 ? Placeholder(cachedGraph->otherTensor_, other) : Placeholder(); Placeholder biasPlaceholder = Placeholder(cachedGraph->biasTensor_, *bias_); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData(), - biasPlaceholder.getMPSGraphTensor() : biasPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = self.numel() != 0 ? 
dictionaryFromPlaceholders(selfPlaceholder, otherPlaceholder, biasPlaceholder) + : dictionaryFromPlaceholders(biasPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } return output; @@ -332,15 +387,32 @@ static Tensor& bmm_out_mps_impl(const Tensor& batch1, const Tensor& batch2, Tensor& result) { using namespace mps; - TORCH_CHECK(batch1.scalar_type() == ScalarType::Double || batch1.scalar_type() == ScalarType::Float || - batch1.scalar_type() == ScalarType::Half, - "MPS device does not support bmm for non-float inputs"); + TORCH_CHECK(supportedFloatingType(batch1), "MPS device does not support bmm for non-float inputs"); if (batch1.numel() == 0 || batch2.numel() == 0) { result.zero_(); return result; } + MPSShape* shape = nil; + bool doTranspose = false; + + // Handle transposes for the second batch of matrices. + if (batch2.is_view() && !batch2.is_contiguous()) { + if (batch2.numel() == batch2._base().numel()) { + const IntArrayRef& viewSizes = batch2.sizes(); + + // Handle 3D and 4D tensors. + // For 4D tensors, first it must have been reshaped from 4D to 3D and then transposed. + int32_t baseTransposeStrideDim = batch2._base().dim() == 4 ? -3 : -2; + if (batch2._base().stride(0) == batch2.stride(0) && + batch2._base().stride(baseTransposeStrideDim) == batch2.stride(-1)) { + shape = @[ @(viewSizes[0]), @(viewSizes[2]), @(viewSizes[1]) ]; + doTranspose = true; + } + } + } + MPSStream* stream = getCurrentMPSStream(); struct CachedGraph : public mps::MPSCachedGraph { @@ -351,14 +423,20 @@ }; @autoreleasepool { - string key = "bmm_out_mps_impl" + getTensorsStringKey({batch1, batch2}); + string key = "bmm_out_mps_impl" + getTensorsStringKey({batch1, batch2}, true, /*exclude_shape*/ true) + + std::to_string(doTranspose); auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - MPSGraphTensor* batch1Tensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, batch1); - MPSGraphTensor* batch2Tensor = mps::mpsGraphRankedPlaceHolder(mpsGraph, batch2); + MPSGraphTensor* batch1Tensor = mps::mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(batch1.scalar_type())); + MPSGraphTensor* batch2Tensor = mps::mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(batch2.scalar_type())); + MPSGraphTensor* batch2TensorTranspose = batch2Tensor; + + if (doTranspose) { + batch2TensorTranspose = [mpsGraph transposeTensor:batch2Tensor dimension:-1 withDimension:-2 name:nil]; + } MPSGraphTensor* productTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:batch1Tensor - secondaryTensor:batch2Tensor + secondaryTensor:batch2TensorTranspose name:@"MM/(batch1@batch2)"]; newCachedGraph->batch1Tensor_ = batch1Tensor; @@ -366,18 +444,11 @@ newCachedGraph->outputTensor_ = productTensor; }); Placeholder batch1Placeholder = Placeholder(cachedGraph->batch1Tensor_, batch1); - Placeholder batch2Placeholder = Placeholder(cachedGraph->batch2Tensor_, batch2); + Placeholder batch2Placeholder = Placeholder(cachedGraph->batch2Tensor_, batch2, shape, !doTranspose); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result); - NSDictionary* feeds = @{ - batch1Placeholder.getMPSGraphTensor() : batch1Placeholder.getMPSGraphTensorData(), - batch2Placeholder.getMPSGraphTensor() : batch2Placeholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = 
dictionaryFromPlaceholders(batch1Placeholder, batch2Placeholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return result; @@ -496,9 +567,7 @@ Tensor addr_mps(const Tensor& self, const Tensor& vec1, const Tensor& vec2, cons TORCH_CHECK(result.is_mps()); TORCH_CHECK(vec1.dim() == 1 && vec2.dim() == 1, "tensors must be 1-D"); - TORCH_CHECK(vec1.scalar_type() == ScalarType::Double || vec1.scalar_type() == ScalarType::Float || - vec1.scalar_type() == ScalarType::Half, - "MPS device does not support addr for non-float input"); + TORCH_CHECK(supportedFloatingType(vec1), "MPS device does not support addr for non-float input"); TensorArg args[]{{result, "out", 0}, {self, "self", 1}, {vec1, "vec1", 2}, {vec2, "vec2", 3}}; checkAllSameGPU(__func__, args); @@ -592,16 +661,8 @@ Tensor addr_mps(const Tensor& self, const Tensor& vec1, const Tensor& vec2, cons Placeholder selfPlaceholder = Placeholder(cachedGraph->selfTensor_, *self_); Placeholder resultPlaceholder = Placeholder(cachedGraph->resultTensor_, result); - NSDictionary* feeds = @{ - vec1Placeholder.getMPSGraphTensor() : vec1Placeholder.getMPSGraphTensorData(), - vec2Placeholder.getMPSGraphTensor() : vec2Placeholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData() - }; - - NSDictionary* results = - @{resultPlaceholder.getMPSGraphTensor() : resultPlaceholder.getMPSGraphTensorData()}; - - mps::runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(vec1Placeholder, vec2Placeholder, selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, resultPlaceholder); } return result; diff --git a/aten/src/ATen/native/mps/operations/LossOps.mm b/aten/src/ATen/native/mps/operations/LossOps.mm index 8f563beabe250..3e58d2ca8a4b2 100644 --- a/aten/src/ATen/native/mps/operations/LossOps.mm +++ b/aten/src/ATen/native/mps/operations/LossOps.mm @@ -76,7 +76,7 @@ static string reductionToString(int64_t reduction) { newCachedGraph->targetTensor = mpsGraphRankedPlaceHolder(mpsGraph, target); newCachedGraph->gradOutputTensor = mpsGraphRankedPlaceHolder(mpsGraph, grad_output); - MPSGraphTensor* normTensor = [mpsGraph constantWithScalar:norm dataType:MPSDataTypeFloat32]; + MPSGraphTensor* normTensor = [mpsGraph constantWithScalar:norm dataType:[newCachedGraph->inputTensor dataType]]; MPSGraphTensor* diffTensor = [mpsGraph subtractionWithPrimaryTensor:newCachedGraph->inputTensor secondaryTensor:newCachedGraph->targetTensor name:nil]; @@ -92,15 +92,8 @@ static string reductionToString(int64_t reduction) { Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor, grad_input); Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor, grad_output); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - targetPlaceholder.getMPSGraphTensor() : targetPlaceholder.getMPSGraphTensorData(), - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder, targetPlaceholder, gradOutputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, gradInputPlaceholder); } return grad_input; @@ -123,11 +116,12 @@ static string 
reductionToString(int64_t reduction) { static MPSGraphTensor* bce_forward_mps(CachedGraph* bceGraph) { MPSGraph* mpsGraph = bceGraph->graph(); + const auto inputType = [bceGraph->inputTensor dataType]; // Forward BCE: L = -w (y ln(x) + (1-y) ln(1-x)) - MPSGraphTensor* one = [mpsGraph constantWithScalar:1.0 dataType:MPSDataTypeFloat32]; + MPSGraphTensor* one = [mpsGraph constantWithScalar:1.0 dataType:inputType]; // -100 is the hard limit value defined in BCELoss Spec. to clamp the log - MPSGraphTensor* neg100 = [mpsGraph constantWithScalar:-100.0 dataType:MPSDataTypeFloat32]; + MPSGraphTensor* neg100 = [mpsGraph constantWithScalar:-100.0 dataType:inputType]; // 1 - x MPSGraphTensor* one_Input = [mpsGraph subtractionWithPrimaryTensor:one secondaryTensor:bceGraph->inputTensor @@ -161,11 +155,12 @@ static string reductionToString(int64_t reduction) { static MPSGraphTensor* bce_backward_mps(CachedGraph* bceGraph) { MPSGraph* mpsGraph = bceGraph->graph(); + const auto inputType = [bceGraph->inputTensor dataType]; // Backward BCE: d(L)/d(x) = -w (y - x) / (x - x^2) - MPSGraphTensor* one = [mpsGraph constantWithScalar:1.0 dataType:MPSDataTypeFloat32]; + MPSGraphTensor* one = [mpsGraph constantWithScalar:1.0 dataType:inputType]; // epsilon used to clamp the grad input denominator - MPSGraphTensor* epsilon = [mpsGraph constantWithScalar:1e-12 dataType:MPSDataTypeFloat32]; + MPSGraphTensor* epsilon = [mpsGraph constantWithScalar:1e-12 dataType:inputType]; // 1 - x MPSGraphTensor* one_Input = [mpsGraph subtractionWithPrimaryTensor:one secondaryTensor:bceGraph->inputTensor @@ -245,7 +240,7 @@ static string reductionToString(int64_t reduction) { if (grad_output.defined()) { if (reduction == at::Reduction::Mean) { MPSGraphTensor* inputNumel = [mpsGraph constantWithScalar:static_cast(input.numel()) - dataType:MPSDataTypeFloat32]; + dataType:[bceLoss dataType]]; newCachedGraph->gradInputTensor = [mpsGraph divisionWithPrimaryTensor:bceLoss secondaryTensor:inputNumel name:nil]; @@ -273,10 +268,7 @@ static string reductionToString(int64_t reduction) { feeds[gradOutputPlaceholder.getMPSGraphTensor()] = gradOutputPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{lossPlaceholder.getMPSGraphTensor() : lossPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, lossPlaceholder); } return loss; @@ -411,10 +403,7 @@ static void nllnd_loss_backward_impl(Tensor& grad_input_arg, if (isWeightsArrayValid) { feeds[weightPlaceholder.getMPSGraphTensor()] = weightPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, gradInputPlaceholder); } } @@ -581,11 +570,7 @@ static void nllnd_loss_forward_impl(Tensor& output, if (isWeightsArrayValid) feeds[weightPlaceholder.getMPSGraphTensor()] = weightPlaceholder.getMPSGraphTensorData(); - NSDictionary* results = @{ - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), - totalWeightsPlaceholder.getMPSGraphTensor() : totalWeightsPlaceholder.getMPSGraphTensorData() - }; - + auto results = dictionaryFromPlaceholders(outputPlaceholder, totalWeightsPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); } @@ -680,14 +665,8 @@ static void 
smooth_l1_loss_impl(const Tensor& input, Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor_, target, mpsInputShape); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output, mpsOutputShape); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - targetPlaceholder.getMPSGraphTensor() : targetPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder, targetPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -804,15 +783,8 @@ static void smooth_l1_loss_backward_impl(const Tensor& grad_output, Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad_output); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - targetPlaceholder.getMPSGraphTensor() : targetPlaceholder.getMPSGraphTensorData(), - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder, targetPlaceholder, gradOutputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, gradInputPlaceholder); } } @@ -887,15 +859,8 @@ static void smooth_l1_loss_backward_impl(const Tensor& grad_output, Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor_, target); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - targetPlaceholder.getMPSGraphTensor() : targetPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder, targetPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } return output; } @@ -1002,23 +967,23 @@ Tensor huber_loss_mps(const Tensor& input, const Tensor& target, int64_t reducti Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor_, target); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, grad_input); - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - targetPlaceholder.getMPSGraphTensor() : targetPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, inputPlaceholder, targetPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return grad_input; } // MSELoss 
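The loss changes above make the BCE graph's scalar constants (1, the -100 log clamp, and the 1e-12 backward epsilon) follow the input's dtype instead of hard-coded MPSDataTypeFloat32, which is what reduced-precision inputs need. A small, hedged smoke test (it assumes half-precision binary_cross_entropy is accepted on the MPS backend once the constants match the input dtype):

```python
import torch
import torch.nn.functional as F

# Run BCE forward/backward with float16 inputs; the graph constants now
# inherit this dtype rather than being created as float32.
x = torch.rand(128, device="mps", dtype=torch.float16, requires_grad=True)
y = (torch.rand(128, device="mps") > 0.5).to(torch.float16)
loss = F.binary_cross_entropy(x, y, reduction="mean")
loss.backward()
print(loss.item(), x.grad.dtype)  # expect a finite loss and float16 grads
```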
-TORCH_IMPL_FUNC(mse_loss_out_mps)(const Tensor& input, const Tensor& target, int64_t reduction, const Tensor& output) { +TORCH_IMPL_FUNC(mse_loss_out_mps)(const Tensor& input, const Tensor& target, int64_t reduction, const Tensor& output_) { string op_name = __func__; using namespace mps; + + bool contiguousOutput = output_.is_contiguous(); + Tensor output = output_; + if (!contiguousOutput) { + output = output_.contiguous(); + } + TORCH_CHECK(target.is_same_size(input), op_name + ": target and input tensors must have identical shapes") TORCH_CHECK(output.is_mps()); @@ -1043,16 +1008,14 @@ Tensor huber_loss_mps(const Tensor& input, const Tensor& target, int64_t reducti }); Placeholder inputPlaceholder = Placeholder(cachedGraph->inputTensor, input); Placeholder targetPlaceholder = Placeholder(cachedGraph->targetTensor, target); - Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, output); + Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor, contiguousOutput ? output_ : output); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - targetPlaceholder.getMPSGraphTensor() : targetPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; + auto feeds = dictionaryFromPlaceholders(inputPlaceholder, targetPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); + } - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + if (!contiguousOutput) { + output_.copy_(output); } } diff --git a/aten/src/ATen/native/mps/operations/Normalization.mm b/aten/src/ATen/native/mps/operations/Normalization.mm index eb754ae597689..bdca3b09780b2 100644 --- a/aten/src/ATen/native/mps/operations/Normalization.mm +++ b/aten/src/ATen/native/mps/operations/Normalization.mm @@ -10,7 +10,9 @@ #include #include #else +#include #include +#include #include #include #include @@ -406,6 +408,36 @@ Check if running mean exists (maybe do this check before making graph) return std::make_tuple(output, save_mean, save_var); } +std::tuple _batch_norm_with_update_mps(const Tensor& input, + const c10::optional& weight_opt, + const c10::optional& bias_opt, + Tensor& running_mean, + Tensor& running_var, + double momentum, + double eps) { + Tensor output, save_mean, save_var; + std::tie(output, save_mean, save_var) = + batch_norm_mps(input, weight_opt, bias_opt, running_mean, running_var, /*train*/ true, momentum, eps); + Tensor reserve = at::empty({0}, input.options().dtype(kByte)); + return std::tuple(output, save_mean, save_var, reserve); +} + +std::tuple _batch_norm_with_update_mps_out(const Tensor& input, + const c10::optional& weight_opt, + const c10::optional& bias_opt, + Tensor& running_mean, + Tensor& running_var, + double momentum, + double eps, + Tensor& out, + Tensor& save_mean, + Tensor& save_var, + Tensor& reserve) { + std::tie(out, save_mean, save_var) = batch_norm_mps_out( + input, weight_opt, bias_opt, running_mean, running_var, /*update*/ true, momentum, eps, out, save_mean, save_var); + return std::tuple(out, save_mean, save_var, reserve); +} + std::tuple _batch_norm_legit_mps(const Tensor& self, const c10::optional& weight_opt, const c10::optional& bias_opt, @@ -471,6 +503,29 @@ static string get_mem_string(c10::MemoryFormat memory_format) { } // Batch norm backward +std::tuple _new_batch_norm_backward_mps(const Tensor& grad_output, + const Tensor& input, + const 
Tensor& weight, + const c10::optional& running_mean_opt, + const c10::optional& running_var_opt, + const c10::optional& save_mean_opt, + const c10::optional& save_var_opt, + bool update, + double eps, + std::array grad_input_mask, + const Tensor& reserve) { + return batch_norm_backward_mps(grad_output, + input, + weight, + running_mean_opt, + running_var_opt, + save_mean_opt, + save_var_opt, + update, + eps, + grad_input_mask); +} + std::tuple batch_norm_backward_mps(const Tensor& grad_out, const Tensor& input, const c10::optional& weight_opt, diff --git a/aten/src/ATen/native/mps/operations/Pad.mm b/aten/src/ATen/native/mps/operations/Pad.mm index 377bbb236f884..badf2c064564e 100644 --- a/aten/src/ATen/native/mps/operations/Pad.mm +++ b/aten/src/ATen/native/mps/operations/Pad.mm @@ -317,9 +317,7 @@ if (is_backward_pass) { feeds[gradOutputPlaceholder.getMPSGraphTensor()] = gradOutputPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } return output; } @@ -467,7 +465,7 @@ Tensor replication_pad3d_backward_mps(const Tensor& grad_output, const Tensor& i return mps::pad_out_template(grad_input, input, padding, grad_output, MPSGraphPaddingModeClampToEdge, 0.0, __func__); } -// backward pass is exlicitly handled in autograd by negating the "pad" argument +// backward pass is explicitly handled in autograd by negating the "pad" argument Tensor constant_pad_nd_mps(const Tensor& self, IntArrayRef pad, const Scalar& value) { if (pad.size() > 6) { TORCH_WARN_ONCE("MPS: The constant padding of more than 3 dimensions is not currently supported natively. 
", diff --git a/aten/src/ATen/native/mps/operations/PixelShuffle.mm b/aten/src/ATen/native/mps/operations/PixelShuffle.mm index 30e85bfde4ec1..f93fb62dc23c5 100644 --- a/aten/src/ATen/native/mps/operations/PixelShuffle.mm +++ b/aten/src/ATen/native/mps/operations/PixelShuffle.mm @@ -75,15 +75,8 @@ static Tensor pixel_shuffle_helper(const Tensor& self, int64_t factor, bool upsc Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - - // Create dictionary of inputs and outputs - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return output; diff --git a/aten/src/ATen/native/mps/operations/PointwiseOps.mm b/aten/src/ATen/native/mps/operations/PointwiseOps.mm index 364acb4323f42..137c14be6ef4d 100644 --- a/aten/src/ATen/native/mps/operations/PointwiseOps.mm +++ b/aten/src/ATen/native/mps/operations/PointwiseOps.mm @@ -86,10 +86,7 @@ static void addc_mul_div_out_mps(const Tensor& self, cachedGraph->valueTensor : getMPSGraphTensorFromScalar(mpsStream, value_scalar), }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(mpsStream, cachedGraph->graph(), feeds, results); + runMPSGraph(mpsStream, cachedGraph->graph(), feeds, outputPlaceholder); } } diff --git a/aten/src/ATen/native/mps/operations/Quantized.mm b/aten/src/ATen/native/mps/operations/Quantized.mm new file mode 100644 index 0000000000000..3743c6f13c371 --- /dev/null +++ b/aten/src/ATen/native/mps/operations/Quantized.mm @@ -0,0 +1,202 @@ +#define TORCH_ASSERT_ONLY_METHOD_OPERATORS + +#ifndef AT_PER_OPERATOR_HEADERS +#include +#include +#else +#include +#include +#include +#endif +#include +#include +// For Metal3_1 +#include +#include + +namespace at::native { + +using namespace mps; + +static at::native::mps::MetalShaderLibrary lib(R"METAL_QUANTIZED( +#include +using namespace metal; + +// A is sizes.x x sizes.y +// B.T is sizes.z x sizes.y +// C is sizes.x x sizes.z + +template +kernel void int4pack_mm( + constant T * A [[buffer(0)]], + constant uchar * B [[buffer(1)]], + constant T * scalesAndZeros [[buffer(2)]], + device T * outputData [[buffer(3)]], + constant uint3 & sizes [[buffer(4)]], + uint thread_index [[thread_position_in_grid]]) { + const uint lda = sizes.y; + const uint ldc = sizes.z; + const uint m = thread_index / sizes.z; // 0..sizes.x-1 + const uint n = thread_index % sizes.z; // 0..sizes.z-1 + const uint nb = n / 32; + const uint ldb = min(32U, sizes.z - nb * 32); + const uint32_t k_block = (sizes.y + groupSize - 1) / groupSize; + constant T *A_ptr = A + m * lda; + constant uchar *B_ptr = B + (nb * 16 * sizes.y); + + float rc = 0.0; + uint k = 0; + for (uint32_t kb = 0; kb < k_block ; kb ++) { + const T scale = scalesAndZeros[(kb * ldc + n) * 2 + 0]; + const T zero = scalesAndZeros[(kb * ldc + n) * 2 + 1] - scale * T(8); + for(uint idx = 0; idx < groupSize && k < sizes.y; idx++, k++) { + const auto a_val = float(A_ptr[k]); + uchar b_val = B_ptr[(k * ldb + (n % 32))/2]; + b_val = (n & 1) == 0 ? 
b_val & 0x0f : (b_val >> 4); + rc += a_val * float(scale * T(b_val) + zero); + } + } + outputData[thread_index] = T(rc); +} + +#define INSTANTIATE_INT4MM(DTYPE, GSIZE) \ +template \ +[[host_name("int4pack_mm_" #GSIZE "_" #DTYPE)]] \ +kernel void int4pack_mm( \ + constant DTYPE * A [[buffer(0)]], \ + constant uchar * B [[buffer(1)]], \ + constant DTYPE * scalesAndZeros [[buffer(2)]], \ + device DTYPE * outputData [[buffer(3)]], \ + constant uint3 & sizes [[buffer(4)]], \ + uint thread_index [[thread_position_in_grid]]) + +INSTANTIATE_INT4MM(float, 32); +INSTANTIATE_INT4MM(half, 32); +INSTANTIATE_INT4MM(float, 64); +INSTANTIATE_INT4MM(half, 64); +INSTANTIATE_INT4MM(float, 128); +INSTANTIATE_INT4MM(half, 128); +INSTANTIATE_INT4MM(float, 256); +INSTANTIATE_INT4MM(half, 256); +#if __METAL_VERSION__ >= 310 +INSTANTIATE_INT4MM(bfloat, 32); +INSTANTIATE_INT4MM(bfloat, 64); +INSTANTIATE_INT4MM(bfloat, 128); +INSTANTIATE_INT4MM(bfloat, 256); +#endif +)METAL_QUANTIZED"); + +Tensor _weight_int4pack_mm_mps(const Tensor& A, const Tensor& B, int64_t qGroupSize, const Tensor& qScaleAndZeros) { + constexpr int64_t kNTileSize = 8; + + auto M = A.size(0); + auto N = B.size(0) * kNTileSize; + auto K = A.size(1); + + TORCH_CHECK(A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat, + __func__, + " : expect A to be either 32-bit or 16-bit float tensor."); + TORCH_CHECK(A.is_contiguous(), __func__, " : expect A to be contiguous."); + TORCH_CHECK(A.dim() == 2, __func__, " : expect A to be 2D tensor."); + + TORCH_CHECK(B.dtype() == kInt, __func__, " : expect B to be int32 tensor."); + TORCH_CHECK(B.is_contiguous(), __func__, " : expect B to be contiguous."); + TORCH_CHECK(B.dim() == 4, __func__, " : expect B to 4d tensor."); + + TORCH_CHECK(qGroupSize == 32 || qGroupSize == 64 || qGroupSize == 128 || qGroupSize == 256, + __func__, + ": expect qGroupSize to be 32, 64, 128 or 256, got ", + qGroupSize); + + TORCH_CHECK(qScaleAndZeros.dim() == 3 && qScaleAndZeros.size(1) == N && qScaleAndZeros.size(2) == 2, + __func__, + ": expect qScaleAndZeros to be 3d tensor with sizes [:, ", + N, + ", 2]"); + + auto C = at::empty({M, N}, A.options()); + id device = MPSDevice::getInstance()->device(); + MPSStream* mpsStream = getCurrentMPSStream(); + std::array sizes = {static_cast(M), static_cast(K), static_cast(N)}; + static bool firstCapture = false; + dispatch_sync_with_rethrow(mpsStream->queue(), ^() { + @autoreleasepool { +#if _CAPTURE_KERNEL + auto& profiler = getMPSProfiler(); + if (profiler.isCaptureEnabled()) { + profiler.startCapture(__func__, mpsStream); + } +#endif + id computeEncoder = mpsStream->commandEncoder(); + const std::string kernel = fmt::format("int4pack_mm_{}_{}", qGroupSize, scalarToMetalTypeString(A)); + id quantizedPSO = lib.getPipelineStateForFunc(kernel); + [computeEncoder setComputePipelineState:quantizedPSO]; + mtl_setBuffer(computeEncoder, A, 0); + mtl_setBuffer(computeEncoder, B, 1); + mtl_setBuffer(computeEncoder, qScaleAndZeros, 2); + mtl_setBuffer(computeEncoder, C, 3); + [computeEncoder setBytes:sizes.data() length:sizeof(uint32_t) * sizes.size() atIndex:4]; + mtl_dispatch1DJob(computeEncoder, quantizedPSO, C.numel()); +#if _CAPTURE_KERNEL + if (profiler.isCapturing()) { + profiler.stopCapture(mpsStream); + } +#endif + } + }); + return C; +} + +Tensor _weight_int8pack_mm_mps(const Tensor& A, const Tensor& B, const Tensor& scales) { + auto M = A.size(0); + auto N = B.size(0); + auto K = A.size(1); + + TORCH_CHECK(A.dtype() == kBFloat16 || A.dtype() == kHalf || A.dtype() == kFloat, 
+ __func__, + " : expect A to be either 32-bit or 16-bit float tensor."); + TORCH_CHECK(A.is_contiguous(), __func__, " : expect A to be contiguous."); + TORCH_CHECK(A.dim() == 2, __func__, " : expect A to be 2D tensor."); + + TORCH_CHECK(B.dtype() == kChar, __func__, " : expect B to be int8 tensor."); + TORCH_CHECK(B.is_contiguous(), __func__, " : expect B to be contiguous."); + TORCH_CHECK(B.size(1) == K, __func__, " : expect B.size(1) == ", K); + + TORCH_CHECK(scales.dim() == 1 && scales.size(0) == N, __func__, " : expect scales to be 1d tensor with size ", N); + + auto C = at::empty({M, N}, A.options()); + + struct CachedGraph : public MPSCachedGraph { + CachedGraph(MPSGraph* graph) : MPSCachedGraph(graph) {} + MPSGraphTensor *ATensor = nil, *BTensor = nil, *scalesTensor = nil; + MPSGraphTensor* outputTensor = nil; + }; + @autoreleasepool { + std::string key = __func__ + getTensorsStringKey({A, B, scales}); + auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { + newCachedGraph->ATensor = mpsGraphRankedPlaceHolder(mpsGraph, A); + newCachedGraph->BTensor = mpsGraphRankedPlaceHolder(mpsGraph, B); + newCachedGraph->scalesTensor = mpsGraphRankedPlaceHolder(mpsGraph, scales); + auto castB = castMPSTensor(mpsGraph, newCachedGraph->BTensor, getMPSScalarType(A)); + auto transposedB = [mpsGraph transposeTensor:castB dimension:-1 withDimension:-2 name:nil]; + auto mmTensor = [mpsGraph matrixMultiplicationWithPrimaryTensor:newCachedGraph->ATensor + secondaryTensor:transposedB + name:nil]; + newCachedGraph->outputTensor = [mpsGraph multiplicationWithPrimaryTensor:mmTensor + secondaryTensor:newCachedGraph->scalesTensor + name:nil]; + }); + auto APlaceholder = Placeholder(cachedGraph->ATensor, A); + auto BPlaceholder = Placeholder(cachedGraph->BTensor, B); + auto scalesPlaceholder = Placeholder(cachedGraph->scalesTensor, scales); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor, C); + runMPSGraph(getCurrentMPSStream(), + cachedGraph->graph(), + dictionaryFromPlaceholders(APlaceholder, BPlaceholder, scalesPlaceholder), + outputPlaceholder); + } + + return C; +} + +} // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/RangeFactories.mm b/aten/src/ATen/native/mps/operations/RangeFactories.mm index 682679aa2045c..102c54c251dba 100644 --- a/aten/src/ATen/native/mps/operations/RangeFactories.mm +++ b/aten/src/ATen/native/mps/operations/RangeFactories.mm @@ -121,9 +121,7 @@ MPSScalar stepScalar = getMPSScalar(step, result.scalar_type()); feeds[cachedGraph->multiplyTensor] = getMPSGraphTensorFromScalar(stream, stepScalar); - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } if (!is_contiguous) { @@ -168,7 +166,7 @@ if (numel != size) { result.resize_({size}); } - bool is_contiguous = result.is_contiguous(); + bool is_contiguous = !mps::needsGather(result); Tensor r = !is_contiguous ? 
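The _weight_int8pack_mm_mps graph above casts the int8 weights to A's dtype, multiplies A by the transposed weight matrix, and applies one scale per output column. A hypothetical CPU reference of that computation:

#include <cstdint>
#include <vector>

// C = (A @ B^T) * scales, with B stored as int8 rows of length K and one scale per column of C.
void int8_mm_reference(const std::vector<float>& A,      // M x K
                       const std::vector<int8_t>& B,     // N x K
                       const std::vector<float>& scales, // N
                       std::vector<float>& C,            // M x N
                       int M, int N, int K) {
  for (int m = 0; m < M; ++m) {
    for (int n = 0; n < N; ++n) {
      float acc = 0.0f;
      for (int k = 0; k < K; ++k) {
        acc += A[m * K + k] * float(B[n * K + k]); // cast-then-matmul, as the graph does
      }
      C[m * N + n] = acc * scales[n]; // per-column scale applied after the matmul
    }
  }
}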
at::empty_like(result, LEGACY_CONTIGUOUS_MEMORY_FORMAT) : result; using namespace mps; auto cache_ = MPSGraphCache::getInstance(); @@ -190,9 +188,7 @@ MPSScalar stepScalar = getMPSScalar(step, result.scalar_type()); feeds[cachedGraph->multiplyTensor] = getMPSGraphTensorFromScalar(stream, stepScalar); - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } if (!is_contiguous) { @@ -259,9 +255,7 @@ MPSScalar multiplyScalar = getMPSScalar(multiply, ScalarType::Float); feeds[cachedGraph->multiplyTensor] = getMPSGraphTensorFromScalar(stream, multiplyScalar); - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } if (!result.is_contiguous()) { diff --git a/aten/src/ATen/native/mps/operations/ReduceOps.mm b/aten/src/ATen/native/mps/operations/ReduceOps.mm index 0fc0ff8f2859d..416c83f0d3b3e 100644 --- a/aten/src/ATen/native/mps/operations/ReduceOps.mm +++ b/aten/src/ATen/native/mps/operations/ReduceOps.mm @@ -30,10 +30,12 @@ #include #include #include +#include #include #include #include #include +#include #include #endif @@ -280,13 +282,8 @@ static void reduction_out_mps(const Tensor& input_t, auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t, mpsShape); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t, apparent_output_shape); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -321,8 +318,8 @@ static void impl_func_norm_mps(const Tensor& input_tensor, auto reciprocal_p = 1 / p; bool pIsZero = (p == 0.0); - bool pIsPosInf = (p == numeric_limits::infinity()); - bool pIsNegInf = (p == -numeric_limits::infinity()); + bool pIsPosInf = (p == std::numeric_limits::infinity()); + bool pIsNegInf = (p == -std::numeric_limits::infinity()); int64_t num_input_dims = input_shape.size(); int64_t num_reduce_dims = dim.size(); @@ -434,10 +431,7 @@ static void impl_func_norm_mps(const Tensor& input_tensor, feeds[otherPlaceholder.getMPSGraphTensor()] = otherPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -606,13 +600,8 @@ static Tensor std_var_common_impl_mps(const Tensor& input_t, auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t, apparent_output_shape); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = 
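Many of the hunks that follow replace hand-written NSDictionary feed/result literals with dictionaryFromPlaceholders. The idea is a variadic helper that turns any number of placeholders into one dictionary; a minimal C++ sketch of that shape, with types and member names that are illustrative only and not the real Placeholder API:

#include <map>
#include <string>

struct FakePlaceholder {
  std::string key;
  int value;
};

// Build a key -> value map from any number of placeholder-like objects (C++17 fold expression).
template <typename... Ps>
std::map<std::string, int> dictionaryFrom(const Ps&... ps) {
  std::map<std::string, int> dict;
  (dict.emplace(ps.key, ps.value), ...);
  return dict;
}

// Usage sketch: auto feeds = dictionaryFrom(inputPlaceholder);
//               auto results = dictionaryFrom(outputPlaceholder, indicesPlaceholder);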
dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return output_t; @@ -661,14 +650,8 @@ static Tensor min_max_mps_impl(const Tensor& input_t, MPSReductionType reduction auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t, @[ @1 ]); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } return output_t; @@ -757,15 +740,8 @@ static void min_max_out_mps(const Tensor& input_t, auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape); auto indicesPlaceholder = Placeholder(cachedGraph->indicesTensor_, indices_t, apparent_out_shape); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = @{ - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), - indicesPlaceholder.getMPSGraphTensor() : indicesPlaceholder.getMPSGraphTensorData() - }; - + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + auto results = dictionaryFromPlaceholders(outputPlaceholder, indicesPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); } } @@ -920,14 +896,8 @@ static void argmax_argmin_out_mps(const Tensor& input_t, auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t, apparent_in_shape); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1266,15 +1236,8 @@ Tensor std_mps(const Tensor& input_t, auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = @{ - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), - }; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1316,15 +1279,8 @@ Tensor std_mps(const Tensor& input_t, auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = @{ - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), - }; - - runMPSGraph(stream, cachedGraph->graph(), 
feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1375,15 +1331,8 @@ Tensor std_mps(const Tensor& input_t, auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = @{ - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), - }; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1422,15 +1371,8 @@ Tensor std_mps(const Tensor& input_t, auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = @{ - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), - }; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -1489,7 +1431,7 @@ Tensor min_mps(const Tensor& input_t) { Tensor median_mps(const Tensor& input_t) { if (!is_macos_13_or_newer()) { TORCH_WARN_ONCE("MPS: median op is supported natively starting from macOS 13.0. ", - "Falling back on CPU. This may have performace implications."); + "Falling back on CPU. 
This may have performance implications."); return at::median(input_t.to("cpu")); } @@ -1536,14 +1478,8 @@ Tensor median_mps(const Tensor& input_t) { auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, input_t); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t, @[ @1 ]); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } return output_t; @@ -1626,15 +1562,8 @@ static void median_out_mps(const Tensor& input_t, auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output_t, apparent_out_shape); auto indicesPlaceholder = Placeholder(cachedGraph->indicesTensor_, indices_t, apparent_out_shape); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - - NSDictionary* results = @{ - outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData(), - indicesPlaceholder.getMPSGraphTensor() : indicesPlaceholder.getMPSGraphTensorData() - }; - + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + auto results = dictionaryFromPlaceholders(outputPlaceholder, indicesPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); } } @@ -1717,7 +1646,7 @@ static void median_out_mps(const Tensor& input_t, if (!is_macos_13_or_newer()) { TORCH_WARN_ONCE("MPS: median op is supported natively starting from macOS 13.0.", - "Falling back on CPU. This may have performace implications."); + "Falling back on CPU. 
This may have performance implications."); return median_from_cpu(input_t.to("cpu"), dim, keepdim, @@ -1732,4 +1661,26 @@ static void median_out_mps(const Tensor& input_t, return std::tuple{values, indices}; } +std::tuple std_mean_mps(const Tensor& self, + at::OptionalIntArrayRef dim, + const c10::optional& correction, + bool keepdim) { + // TODO: Refactor it into a proper std_var_mean composite function + auto std = std_mps(self, dim, correction, keepdim); + auto mean = at::empty(std.sizes(), self.scalar_type(), c10::nullopt, kMPS, c10::nullopt, MemoryFormat::Contiguous); + mps::reduction_out_mps(self, dim, keepdim, c10::nullopt, mean, mps::MPSReductionType::MEAN, "mean_out_mps"); + return {std, mean}; +} + +std::tuple var_mean_mps(const Tensor& self, + at::OptionalIntArrayRef dim, + const c10::optional& correction, + bool keepdim) { + // TODO: Refactor it into a proper std_var_mean composite function + auto var = var_mps(self, dim, correction, keepdim); + auto mean = at::empty(var.sizes(), self.scalar_type(), c10::nullopt, kMPS, c10::nullopt, MemoryFormat::Contiguous); + mps::reduction_out_mps(self, dim, keepdim, c10::nullopt, mean, mps::MPSReductionType::MEAN, "mean_out_mps"); + return {var, mean}; +} + } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/RenormKernel.mm b/aten/src/ATen/native/mps/operations/RenormKernel.mm index c72e9560abba0..c4655baa90eee 100644 --- a/aten/src/ATen/native/mps/operations/RenormKernel.mm +++ b/aten/src/ATen/native/mps/operations/RenormKernel.mm @@ -15,7 +15,9 @@ namespace at::native { namespace { -static const char* METAL_RENORM = R"RENORM_METAL( +using namespace mps; + +static MetalShaderLibrary lib(R"RENORM_METAL( #include using namespace metal; @@ -41,48 +43,7 @@ kernel void renorm(constant T* norm [[buffer(0)]], REGISTER_RENORM_OP(float); REGISTER_RENORM_OP(half); -)RENORM_METAL"; - -using namespace mps; - -static id compileRenormLibrary(id device, const std::string& key) { - static std::unordered_map> libMap; - auto it = libMap.find(key); - if (it != libMap.end()) { - return it->second; - } - - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; -stringWithCString: - id renormLibrary = [device newLibraryWithSource:[NSString stringWithUTF8String:METAL_RENORM] - options:options - error:&error]; - TORCH_CHECK( - renormLibrary, "Failed to to create renorm mps kernel library, error: ", error.localizedDescription.UTF8String); - - libMap[key] = renormLibrary; - return renormLibrary; -} - -static id renormPipelineState(id device, const std::string& key) { - static std::unordered_map> psoCache; - id pso = psoCache[key]; - if (pso) { - return pso; - } - - NSError* error = nil; - id renormLib = compileRenormLibrary(device, key); - id renormFunc = [renormLib newFunctionWithName:[NSString stringWithUTF8String:key.c_str()]]; - TORCH_CHECK(renormFunc, "Failed to create function state object for: ", key); - pso = [device newComputePipelineStateWithFunction:renormFunc error:&error]; - TORCH_CHECK(pso, "Failed to created pipeline state object, error: ", [[error description] UTF8String]); - - psoCache[key] = pso; - return pso; -} +)RENORM_METAL"); void renorm_out_mps(const Tensor& self, const Scalar& p, int64_t dim, const Scalar& maxnorm, const Tensor& out) { auto self_sizes = self.sizes(); @@ -100,10 +61,10 @@ void renorm_out_mps(const Tensor& self, const Scalar& p, int64_t dim, const Scal id normBuffer = getMTLBufferStorage(norm); id factorBuffer = 
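The new std_mean_mps and var_mean_mps above simply pair the existing std/var reduction with a mean reduction over the same dims. A small CPU sketch of the math they are expected to agree with, assuming the usual correction (Bessel) semantics, for a flat input:

#include <cmath>
#include <utility>
#include <vector>

// Returns {variance, mean}; correction = 1 is the sample variance, 0 the population variance.
std::pair<double, double> var_mean_ref(const std::vector<double>& x, int correction = 1) {
  const double n = static_cast<double>(x.size());
  double mean = 0.0;
  for (double v : x) mean += v;
  mean /= n;
  double var = 0.0;
  for (double v : x) var += (v - mean) * (v - mean);
  var /= (n - correction);
  return {var, mean};
}

// std_mean is then {std::sqrt(var), mean} for the same reduction dims.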
getMTLBufferStorage(factor); - string key = "renorm_" + scalarToMetalTypeString(self.scalar_type()); + string key = "renorm_" + scalarToMetalTypeString(self); MPSStream* mpsStream = getCurrentMPSStream(); id computeEncoder = mpsStream->commandEncoder(); - id renormPSO = renormPipelineState(device, key); + id renormPSO = lib.getPipelineStateForFunc(key); dispatch_sync(mpsStream->queue(), ^() { @autoreleasepool { diff --git a/aten/src/ATen/native/mps/operations/Repeat.mm b/aten/src/ATen/native/mps/operations/Repeat.mm index ac84e98f491f5..db7722b0e63b4 100644 --- a/aten/src/ATen/native/mps/operations/Repeat.mm +++ b/aten/src/ATen/native/mps/operations/Repeat.mm @@ -10,10 +10,6 @@ #include #include -#ifdef __OBJC__ -#include -#endif - namespace at::native { Tensor permute_mps(const Tensor& self, IntArrayRef dims) { @@ -90,18 +86,14 @@ Tensor repeat_mps(const Tensor& self, IntArrayRef repeats) { Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, result, /*mpsShape=*/nil, /*gatherTensorData*/ false, outputDataType); - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return result; } -static const char* METAL_REPEAT_INTERLEAVE = R"METAL_REPEAT( +static mps::MetalShaderLibrary lib(R"METAL_REPEAT( kernel void repeat_interleave(constant {0} * repeat_ptr [[buffer(0)]], constant int64_t * cumsum_ptr [[buffer(1)]], device {0} * result_ptr [[buffer(2)]], @@ -114,49 +106,12 @@ kernel void repeat_interleave(constant {0} * repeat_ptr [[buf result_ptr[j] = tid; }} }} -)METAL_REPEAT"; - -static id compileRepeatInterleaveLib(id device, const std::string& t1) { - auto key = t1; - static std::unordered_map> libMap; - auto it = libMap.find(key); - if (it != libMap.end()) { - return it->second; - } - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; - auto rc = - [device newLibraryWithSource:[NSString stringWithUTF8String:fmt::format(METAL_REPEAT_INTERLEAVE, t1).c_str()] - options:options - error:&error]; - TORCH_CHECK(rc != nil && error == nil, "Failed to compile library: ", [[error localizedDescription] UTF8String]); - libMap[key] = rc; - return rc; -} - -static id getPipelineState(id device, const std::string& t1) { - static std::string kernel = "repeat_interleave"; - auto key = kernel + t1; - static std::unordered_map> cplMap; - auto it = cplMap.find(key); - if (it != cplMap.end()) { - return it->second; - } - NSError* error = nil; - auto library = compileRepeatInterleaveLib(device, t1); - id func = [library newFunctionWithName:[NSString stringWithUTF8String:kernel.c_str()]]; - TORCH_CHECK(func != nil, "Can't get kernel ", kernel); - auto rc = [device newComputePipelineStateWithFunction:func error:&error]; - TORCH_CHECK( - rc != nil && error == nil, "Failed to construct pipeline state: ", [[error localizedDescription] UTF8String]); - cplMap[key] = rc; - return rc; -} +)METAL_REPEAT", + 1); template -void computeRepeatIndices(index_t* repeat_ptr, - int64_t* cumsum_ptr, +void computeRepeatIndices(const index_t* repeat_ptr, + const int64_t* cumsum_ptr, index_t* result_ptr, int64_t size, int64_t result_size) { @@ -178,7 +133,7 @@ void 
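The repeat_interleave kernel retained above assigns one thread per input element: thread tid writes its own index into result[cumsum[tid] - repeat[tid], cumsum[tid]). A serial C++ equivalent of that index expansion:

#include <cstdint>
#include <vector>

std::vector<int64_t> repeat_interleave_indices(const std::vector<int64_t>& repeats) {
  std::vector<int64_t> cumsum(repeats.size());
  int64_t running = 0;
  for (size_t i = 0; i < repeats.size(); ++i) {
    running += repeats[i];
    cumsum[i] = running;
  }
  std::vector<int64_t> result(running);
  for (size_t tid = 0; tid < repeats.size(); ++tid) { // one GPU thread per input element
    const int64_t end = cumsum[tid];
    const int64_t start = end - repeats[tid];
    for (int64_t j = start; j < end; ++j) {
      result[j] = static_cast<int64_t>(tid);
    }
  }
  return result;
}
// e.g. repeats = {2, 0, 3}  ->  result = {0, 0, 2, 2, 2}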
computeRepeatIndices(index_t* repeat_ptr, dispatch_sync(mpsStream->queue(), ^() { @autoreleasepool { id computeEncoder = mpsStream->commandEncoder(); - id pipelineState = getPipelineState(MPSDevice::getInstance()->device(), scalar_type); + id pipelineState = lib.getPipelineStateForFunc("repeat_interleave", {scalar_type}); // this function call is a no-op if MPS Profiler is not enabled getMPSProfiler().beginProfileKernel(pipelineState, "repeat_interleave:" + scalar_type, false); diff --git a/aten/src/ATen/native/mps/operations/ScatterGather.mm b/aten/src/ATen/native/mps/operations/ScatterGather.mm index 8496a16506f87..a3fc5f690754c 100644 --- a/aten/src/ATen/native/mps/operations/ScatterGather.mm +++ b/aten/src/ATen/native/mps/operations/ScatterGather.mm @@ -102,14 +102,8 @@ Placeholder indexPlaceholder = Placeholder(cachedGraph->indexTensor_, index, index_shape); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output, nullptr, false, output_type); - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - indexPlaceholder.getMPSGraphTensor() : indexPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, indexPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -299,15 +293,8 @@ static void scatter_mps_general(const Tensor& self_arg, Placeholder indexPlaceholder = Placeholder(cachedGraph->indexTensor_, index, index_shape); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - NSDictionary* feeds = @{ - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - srcPlaceholder.getMPSGraphTensor() : srcPlaceholder.getMPSGraphTensorData(), - indexPlaceholder.getMPSGraphTensor() : indexPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder, srcPlaceholder, indexPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } diff --git a/aten/src/ATen/native/mps/operations/Shape.mm b/aten/src/ATen/native/mps/operations/Shape.mm index e6148904aba44..135041be1f41e 100644 --- a/aten/src/ATen/native/mps/operations/Shape.mm +++ b/aten/src/ATen/native/mps/operations/Shape.mm @@ -198,13 +198,8 @@ static void check_shape_except_dim(const Tensor& first, const Tensor& second, in Placeholder valuesPlaceholder = Placeholder(cachedGraph->valuesTensor, values); Placeholder indicesPlaceholder = Placeholder(cachedGraph->indicesTensor, indices); // Create dictionary of inputs and outputs - NSDictionary* feeds = nil; - feeds = @{inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData()}; - NSDictionary* results = @{ - valuesPlaceholder.getMPSGraphTensor() : valuesPlaceholder.getMPSGraphTensorData(), - indicesPlaceholder.getMPSGraphTensor() : indicesPlaceholder.getMPSGraphTensorData() - }; - + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + auto results = dictionaryFromPlaceholders(valuesPlaceholder, indicesPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); } } @@ -325,8 +320,16 
@@ static void check_shape_except_dim(const Tensor& first, const Tensor& second, in }; @autoreleasepool { - string key = "cat_out_mps:" + to_string(dimension) + getTensorsStringKey(input_tensors, /*short_dtype*/ true) + - ":" + (memory_format == MemoryFormat::ChannelsLast ? "NHWC" : "NCHW"); + string key = + "cat_out_mps:" + to_string(dimension) + ":" + (memory_format == MemoryFormat::ChannelsLast ? "NHWC" : "NCHW"); + if (!all_same_dtype) { + key += getTensorsStringKey(input_tensors, true, all_same_sizes_and_stride); + } else { + key += ":" + getMPSTypeString(input_tensors[0].scalar_type(), true) + ":" + to_string(inputs.size()); + } + for (auto idx : skipped_tensor_indices) { + key += "," + std::to_string(idx); + } auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { auto len_tensor_array = inputs.size() - skipped_tensor_indices.size(); @@ -339,8 +342,7 @@ static void check_shape_except_dim(const Tensor& first, const Tensor& second, in if (tensor.scalar_type() == kBool) { scalar_type = MPSDataTypeInt8; } - newCachedGraph->inputTensors_[idx] = - mpsGraphRankedPlaceHolder(mpsGraph, scalar_type, getMPSShape(tensor, MemoryFormat::Contiguous)); + newCachedGraph->inputTensors_[idx] = mpsGraphUnrankedPlaceHolder(mpsGraph, scalar_type); if (tensor.scalar_type() != out_dtype) { castInputTensors[idx] = [mpsGraph castTensor:newCachedGraph->inputTensors_[idx] toType:getMPSDataType(out_dtype) @@ -369,11 +371,7 @@ static void check_shape_except_dim(const Tensor& first, const Tensor& second, in if (tensor.scalar_type() == kBool) { scalar_type = MPSDataTypeInt8; } - inputPlaceholders.emplace_back(cachedGraph->inputTensors_[t_idx], - tensor, - getMPSShape(tensor, MemoryFormat::Contiguous), - /*gatherTensorData*/ true, - scalar_type); + inputPlaceholders.emplace_back(cachedGraph->inputTensors_[t_idx], tensor, nullptr, true, scalar_type); t_idx++; } i++; @@ -390,10 +388,7 @@ static void check_shape_except_dim(const Tensor& first, const Tensor& second, in for (auto& inputPlaceholder : inputPlaceholders) { feeds[inputPlaceholder.getMPSGraphTensor()] = inputPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } diff --git a/aten/src/ATen/native/mps/operations/SoftMax.mm b/aten/src/ATen/native/mps/operations/SoftMax.mm index 92531164fbcff..4687ac6b96a1a 100644 --- a/aten/src/ATen/native/mps/operations/SoftMax.mm +++ b/aten/src/ATen/native/mps/operations/SoftMax.mm @@ -92,10 +92,11 @@ static void get_shapes(MPSShape* input_shape_readonly, NSString* ns_shape_key = [[input_shape valueForKey:@"description"] componentsJoinedByString:@","]; - string key = "softmax_mps_out:" + mem_format_key + ":" + getMPSTypeString(input) + ":" + [ns_shape_key UTF8String] + + string key = "softmax_mps_out" + getTensorsStringKey(input, true, /*exclude_shape*/ true) + ":" + mem_format_key + ":" + std::to_string(dim_); + auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - MPSGraphTensor* inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, getMPSDataType(input), input_shape); + MPSGraphTensor* inputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(input.scalar_type())); // passing selector of softMaxWithTensor on the mpsGraph object MPSGraphTensor* outputTensor = [mpsGraph 
softMaxWithTensor:inputTensor axis:(NSInteger)dim_ name:nil]; @@ -122,12 +123,8 @@ static void get_shapes(MPSShape* input_shape_readonly, // This must be the Contiguous shape Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - NSDictionary* feeds = - @{inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData()}; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -186,14 +183,8 @@ static void get_shapes(MPSShape* input_shape_readonly, Placeholder gradOutputPlaceholder = Placeholder(cachedGraph->gradOutputTensor_, grad, grad_shape); Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); - NSDictionary* feeds = @{ - softmaxPlaceholder.getMPSGraphTensor() : softmaxPlaceholder.getMPSGraphTensorData(), - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(softmaxPlaceholder, gradOutputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); } } diff --git a/aten/src/ATen/native/mps/operations/Sort.mm b/aten/src/ATen/native/mps/operations/Sort.mm index bb12aa657c735..e3ee85cfe230e 100644 --- a/aten/src/ATen/native/mps/operations/Sort.mm +++ b/aten/src/ATen/native/mps/operations/Sort.mm @@ -89,12 +89,8 @@ Placeholder valuesPlaceholder = Placeholder(cachedGraph->valuesTensor, values); Placeholder indicesPlaceholder = Placeholder(cachedGraph->indicesTensor, indices); // Create dictionary of inputs and outputs - NSDictionary* feeds = nil; - feeds = @{inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData()}; - NSDictionary* results = @{ - valuesPlaceholder.getMPSGraphTensor() : valuesPlaceholder.getMPSGraphTensorData(), - indicesPlaceholder.getMPSGraphTensor() : indicesPlaceholder.getMPSGraphTensorData() - }; + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + auto results = dictionaryFromPlaceholders(valuesPlaceholder, indicesPlaceholder); runMPSGraph(stream, cachedGraph->graph(), feeds, results); } diff --git a/aten/src/ATen/native/mps/operations/SummaryOps.mm b/aten/src/ATen/native/mps/operations/SummaryOps.mm index 5c65fb3d0a089..34f7fbeae50a2 100644 --- a/aten/src/ATen/native/mps/operations/SummaryOps.mm +++ b/aten/src/ATen/native/mps/operations/SummaryOps.mm @@ -66,11 +66,8 @@ feeds[weightsPlaceholder.getMPSGraphTensor()] = weightsPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - // Run the graph - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return output; diff --git a/aten/src/ATen/native/mps/operations/TensorCompare.mm b/aten/src/ATen/native/mps/operations/TensorCompare.mm index 781d13a27e78e..f378af1326a73 100644 --- a/aten/src/ATen/native/mps/operations/TensorCompare.mm +++ b/aten/src/ATen/native/mps/operations/TensorCompare.mm @@ -12,7 +12,9 @@ #include #include #include +#include #include +#include #include #endif @@ -30,41 
+32,53 @@ static void clamp_mps_graph(CachedGraph* cachedGraph, const Tensor& min_tensor, const Tensor& max_tensor) { auto input_dtype = input_tensor.scalar_type(); - auto min_dtype = input_dtype; - auto max_dtype = input_dtype; - if (cachedGraph->minTensor) { - min_dtype = min_tensor.scalar_type(); - } - if (cachedGraph->maxTensor) { - max_dtype = max_tensor.scalar_type(); - } + auto min_dtype = cachedGraph->minTensor ? min_tensor.scalar_type() : input_dtype; + auto max_dtype = cachedGraph->maxTensor ? max_tensor.scalar_type() : input_dtype; MPSGraph* mpsGraph = cachedGraph->graph(); cachedGraph->inputTensor = mpsGraphRankedPlaceHolder(mpsGraph, input_tensor); - MPSGraphTensor* minTensor = cachedGraph->minTensor; - MPSGraphTensor* maxTensor = cachedGraph->maxTensor; + auto minTensor = cachedGraph->minTensor; + auto maxTensor = cachedGraph->maxTensor; + if (input_dtype != min_dtype) { minTensor = castMPSTensor(mpsGraph, cachedGraph->minTensor, input_dtype); } if (input_dtype != max_dtype) { maxTensor = castMPSTensor(mpsGraph, cachedGraph->maxTensor, input_dtype); } - if (cachedGraph->minTensor && cachedGraph->maxTensor) { - cachedGraph->outputTensor = [mpsGraph clampWithTensor:cachedGraph->inputTensor - minValueTensor:minTensor - maxValueTensor:maxTensor - name:nil]; - } else if (cachedGraph->maxTensor) { - cachedGraph->outputTensor = [mpsGraph minimumWithPrimaryTensor:cachedGraph->inputTensor - secondaryTensor:maxTensor - name:nil]; - } else if (cachedGraph->minTensor) { - cachedGraph->outputTensor = [mpsGraph maximumWithPrimaryTensor:cachedGraph->inputTensor - secondaryTensor:minTensor - name:nil]; + if (c10::isIntegralType(input_dtype, /*includeBool=*/true)) { + if (minTensor && maxTensor) { + cachedGraph->outputTensor = [mpsGraph clampWithTensor:cachedGraph->inputTensor + minValueTensor:minTensor + maxValueTensor:maxTensor + name:nil]; + } else if (maxTensor) { + cachedGraph->outputTensor = [mpsGraph minimumWithPrimaryTensor:cachedGraph->inputTensor + secondaryTensor:maxTensor + name:nil]; + } else if (minTensor) { + cachedGraph->outputTensor = [mpsGraph maximumWithPrimaryTensor:cachedGraph->inputTensor + secondaryTensor:minTensor + name:nil]; + } + return; + } + // clampWithTensor doesn't propagate NaN through so simulate it as composition of + // maximumWithNaNPropagationWithPrimaryTensor and minimumWithNaNPropagationWithPrimaryTensor + auto outputTensor = cachedGraph->inputTensor; + if (minTensor) { + outputTensor = [mpsGraph maximumWithNaNPropagationWithPrimaryTensor:outputTensor + secondaryTensor:minTensor + name:nil]; } + if (maxTensor) { + outputTensor = [mpsGraph minimumWithNaNPropagationWithPrimaryTensor:outputTensor + secondaryTensor:maxTensor + name:nil]; + } + cachedGraph->outputTensor = outputTensor; } static void check_min_max_dims(const OptionalTensorRef clamp_opt, const Tensor& input_t, string op_name) { @@ -198,10 +212,7 @@ static void clamp_tensor_out_mps(const Tensor& input_t, feeds[maxPlaceholder.getMPSGraphTensor()] = maxPlaceholder.getMPSGraphTensorData(); } - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -254,13 +265,70 @@ static void clamp_scalar_out_mps(const Tensor& input_t, auto outputPlaceholder = Placeholder(cachedGraph->outputTensor, output_t, /*mpsShape=*/nil, /*gatherTensorData=*/false); - NSDictionary* feeds = @{ - 
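The clamp rework above keeps clampWithTensor / minimum / maximum only for integral inputs; for floating point it composes the NaN-propagating maximum and minimum so that NaN inputs survive clamping. The intended scalar semantics, sketched:

#include <cmath>

// NaN inputs stay NaN; ordinary values are clamped to [lo, hi].
float clamp_nan_propagating(float x, float lo, float hi) {
  if (std::isnan(x)) {
    return x;                  // propagate NaN instead of clamping it to a bound
  }
  float y = std::fmax(x, lo);  // lower bound first (maximum)
  y = std::fmin(y, hi);        // then upper bound (minimum)
  return y;
}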
inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); + } +} - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); +static void isin_Tensor_Tensor_out_mps(const Tensor& elements, + const Tensor& test_elements, + bool assume_unique, + bool invert, + const Tensor& out, + string op_name) { + TORCH_CHECK(is_macos_13_or_newer(MacOSVersion::MACOS_VER_14_0_PLUS), + "isin_Tensor_Tensor_out supported on MPS from MacOs_14_0 onwards"); + if (elements.numel() == 0) { + return; + } + + if (test_elements.numel() == 0) { + if (invert) { + auto ones = ones_like(out); + out.copy_(ones); + } else { + auto zeros = zeros_like(out); + out.copy_(zeros); + } + return; + } + + TORCH_CHECK(elements.is_mps() && test_elements.is_mps()); + TORCH_CHECK(elements.dtype() == test_elements.dtype()); + + @autoreleasepool { + string key = + op_name + getTensorsStringKey({elements}) + getTensorsStringKey({test_elements}) + std::to_string(invert); + + auto cachedGraph = LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { + MPSGraphTensor* inputTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(elements.scalar_type())); + MPSGraphTensor* otherTensor = mpsGraphUnrankedPlaceHolder(mpsGraph, getMPSDataType(test_elements.scalar_type())); + + newCachedGraph->inputTensor_ = inputTensor; + newCachedGraph->otherTensor_ = otherTensor; + + MPSShape* outputShape = getMPSShape(out); + + MPSGraphTensor* input_flattened = [mpsGraph reshapeTensor:inputTensor withShape:@[ @-1, @1 ] name:nil]; + MPSGraphTensor* other_flattened = [mpsGraph reshapeTensor:otherTensor withShape:@[ @1, @-1 ] name:nil]; + MPSGraphTensor* isInTensor = [mpsGraph equalWithPrimaryTensor:input_flattened + secondaryTensor:other_flattened + name:nil]; + MPSGraphTensor* output = [mpsGraph reductionOrWithTensor:isInTensor axis:1 name:nil]; + output = [mpsGraph reshapeTensor:output withShape:outputShape name:nil]; + + if (invert) { + output = [mpsGraph notWithTensor:output name:nil]; + } + newCachedGraph->outputTensor_ = output; + }); + + auto inputPlaceholder = Placeholder(cachedGraph->inputTensor_, elements); + auto otherPlaceholder = Placeholder(cachedGraph->otherTensor_, test_elements); + auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); + + auto feeds = dictionaryFromPlaceholders(inputPlaceholder, otherPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -297,7 +365,16 @@ static void clamp_scalar_out_mps(const Tensor& input_t, mps::clamp_scalar_out_mps(input_t, at::OptionalScalarRef(), max, output_t, __func__); } -Tensor& where_self_out_mps(const Tensor& condition, const Tensor& self, const Tensor& other, Tensor& out) { +TORCH_IMPL_FUNC(isin_Tensor_Tensor_out_mps) +(const Tensor& elements, const Tensor& test_elements, bool assume_unique, bool invert, const Tensor& out) { + mps::isin_Tensor_Tensor_out_mps(elements, test_elements, assume_unique, invert, out, __func__); +} + +static void where_kernel_mps(TensorIterator& iter) { + const auto& condition = iter.input(0); + const auto& self = iter.input(1); + const auto& other = iter.input(2); + auto& out = iter.output(0); TORCH_CHECK(condition.device() == self.device() && self.device() == other.device(), 
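The new isin_Tensor_Tensor_out_mps above reshapes elements into a column and test_elements into a row, compares them with broadcasting, then ORs along the row (optionally inverting the result). An equivalent C++ sketch:

#include <vector>

std::vector<bool> isin_ref(const std::vector<int>& elements,
                           const std::vector<int>& test_elements,
                           bool invert = false) {
  std::vector<bool> out(elements.size(), false);
  for (size_t i = 0; i < elements.size(); ++i) {
    bool found = false;
    for (int t : test_elements) {  // reductionOr over the broadcast equality
      if (elements[i] == t) {
        found = true;
        break;
      }
    }
    out[i] = invert ? !found : found;
  }
  return out;
}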
"Expected all tensors to be on the same device, but found at least two devices."); TORCH_CHECK(self.dtype() == other.dtype(), "expected scalar type ", self.dtype(), " but found ", other.dtype()); @@ -316,8 +393,9 @@ static void clamp_scalar_out_mps(const Tensor& input_t, MPSStream* stream = getCurrentMPSStream(); // Empty output - if (out.numel() == 0) - return out; + if (out.numel() == 0) { + return; + } // Derive from MPSCachedGraph struct CachedGraph : public MPSCachedGraph { @@ -372,61 +450,9 @@ static void clamp_scalar_out_mps(const Tensor& input_t, Placeholder(cachedGraph->otherTensor_, other, /*mpsShape=*/nullptr, /*gatherTensorData=*/true, otherDataType); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, out); - NSDictionary* feeds = @{ - conditionPlaceholder.getMPSGraphTensor() : conditionPlaceholder.getMPSGraphTensorData(), - selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData(), - otherPlaceholder.getMPSGraphTensor() : otherPlaceholder.getMPSGraphTensorData() - }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(conditionPlaceholder, selfPlaceholder, otherPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } - - return out; -} - -Tensor where_mps(const Tensor& condition, const Tensor& self, const Tensor& other) { - auto max_dim = std::max(condition.dim(), std::max(self.dim(), other.dim())); - - // How many leading dimensions do we broadcast across for each Tensor? - int cond_num_implicit_ones = (max_dim - condition.dim()); - int self_num_implicit_ones = (max_dim - self.dim()); - int other_num_implicit_ones = (max_dim - other.dim()); - - std::vector out_arr(max_dim); - - // Broadcasted output shape - for (int i = 0; i < max_dim; i++) { - // Use up the leading broadcast dimensions for each Tensor, then continue from the start of the "actual" shape - int64_t cond_idx = i < cond_num_implicit_ones ? 1 : (condition.size(i - cond_num_implicit_ones)); - int64_t self_idx = i < self_num_implicit_ones ? 1 : (self.size(i - self_num_implicit_ones)); - int64_t other_idx = i < other_num_implicit_ones ? 1 : (other.size(i - other_num_implicit_ones)); - - auto max_idx = std::max({cond_idx, self_idx, other_idx}); - - TORCH_CHECK(cond_idx == max_idx || cond_idx == 1 || (cond_idx == 0 && max_idx == 1), - i, - "'th index ", - cond_idx, - " of condition tensor does not match the other tensors") - TORCH_CHECK(self_idx == max_idx || self_idx == 1 || (self_idx == 0 && max_idx == 1), - i, - "'th index ", - self_idx, - " of x tensor does not match the other tensors") - TORCH_CHECK(other_idx == max_idx || other_idx == 1 || (other_idx == 0 && max_idx == 1), - i, - "'th index ", - other_idx, - " of x tensor does not match the other tensors") - - out_arr[i] = (cond_idx == 0 || self_idx == 0 || other_idx == 0) ? 
0 : max_idx; - } - - Tensor ret = at::empty( - IntArrayRef(out_arr), self.scalar_type(), c10::nullopt, kMPS, c10::nullopt, self.suggest_memory_format()); - return where_self_out_mps(condition, self, other, ret); } Tensor& nan_to_num_out_mps(const Tensor& self, @@ -520,11 +546,11 @@ Tensor where_mps(const Tensor& condition, const Tensor& self, const Tensor& othe cachedGraph->posInfReplacementTensor : getMPSGraphTensorFromScalar(stream, posInfReplacementScalar), cachedGraph->negInfReplacementTensor : getMPSGraphTensorFromScalar(stream, negInfReplacementScalar), }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } return result; } +REGISTER_DISPATCH(where_kernel, &where_kernel_mps); + } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/TriangularOps.mm b/aten/src/ATen/native/mps/operations/TriangularOps.mm index 89eb5c3c37fa1..5fa0b22184535 100644 --- a/aten/src/ATen/native/mps/operations/TriangularOps.mm +++ b/aten/src/ATen/native/mps/operations/TriangularOps.mm @@ -55,12 +55,8 @@ Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } @@ -105,12 +101,8 @@ Placeholder selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self); Placeholder outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output); - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); } } diff --git a/aten/src/ATen/native/mps/operations/UnaryKernel.mm b/aten/src/ATen/native/mps/operations/UnaryKernel.mm index 4998d90e46871..540fc6a26cd81 100644 --- a/aten/src/ATen/native/mps/operations/UnaryKernel.mm +++ b/aten/src/ATen/native/mps/operations/UnaryKernel.mm @@ -39,45 +39,7 @@ return getMetalType(t.scalar_type()); } -static id compileUnaryOpsLibrary(id device, const std::string& t1, const std::string& t2) { - auto key = t1 + t2; - static std::unordered_map> libMap; - auto it = libMap.find(key); - if (it != libMap.end()) { - return it->second; - } - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; - auto rc = - [device newLibraryWithSource:[NSString stringWithUTF8String:fmt::format(UNARY_KERNEL_TEMPLATE, t1, t2).c_str()] - options:options - error:&error]; - TORCH_CHECK(rc != nil && error == nil, "Failed to compile library: ", [[error localizedDescription] UTF8String]); - libMap[key] = rc; - return rc; -} - -static id getCPLState(id device, - const std::string& t1, - const std::string& t2, - const std::string& fname) { - auto key = t1 + t2 
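where is rerouted through TensorIterator (where_kernel_mps plus REGISTER_DISPATCH), which makes the hand-rolled broadcast-shape computation in the deleted where_mps unnecessary. A sketch of the broadcasting rule that code implemented, ignoring zero-size edge cases:

#include <algorithm>
#include <stdexcept>
#include <vector>

// Each dimension's sizes must match or be 1; missing leading dims count as size 1.
std::vector<int64_t> broadcast_shapes(const std::vector<std::vector<int64_t>>& shapes) {
  size_t ndim = 0;
  for (const auto& s : shapes) ndim = std::max(ndim, s.size());
  std::vector<int64_t> out(ndim, 1);
  for (const auto& s : shapes) {
    const size_t offset = ndim - s.size();
    for (size_t i = 0; i < s.size(); ++i) {
      const int64_t cur = s[i];
      int64_t& dst = out[offset + i];
      if (cur == dst || cur == 1) continue;
      if (dst == 1) { dst = cur; continue; }
      throw std::invalid_argument("shapes are not broadcastable");
    }
  }
  return out;
}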
+ fname; - static std::unordered_map> cplMap; - auto it = cplMap.find(key); - if (it != cplMap.end()) { - return it->second; - } - NSError* error = nil; - auto library = compileUnaryOpsLibrary(device, t1, t2); - id func = [library newFunctionWithName:[NSString stringWithUTF8String:fname.c_str()]]; - TORCH_CHECK(func != nil, "Can't get function ", fname); - auto rc = [device newComputePipelineStateWithFunction:func error:&error]; - TORCH_CHECK( - rc != nil && error == nil, "Failed to construct pipeline state: ", [[error localizedDescription] UTF8String]); - cplMap[key] = rc; - return rc; -} +static mps::MetalShaderLibrary lib(UNARY_KERNEL_TEMPLATE, 2); TORCH_IMPL_FUNC(erfinv_out_mps)(const Tensor& self, const Tensor& output_) { // handle erfinv ops using metal kernel @@ -95,9 +57,7 @@ } using namespace mps; @autoreleasepool { - id device = MPSDevice::getInstance()->device(); - id cplState = - getCPLState(device, getMetalType(outputTensor), getMetalType(self), "erfinv_mps_kernel"); + auto cplState = lib.getPipelineStateForFunc("erfinv_mps_kernel", {getMetalType(outputTensor), getMetalType(self)}); if (!self.is_contiguous()) { inputTensor = inputTensor.contiguous(); diff --git a/aten/src/ATen/native/mps/operations/UnaryOps.mm b/aten/src/ATen/native/mps/operations/UnaryOps.mm index 970c0ee77e868..46709f2489e7d 100644 --- a/aten/src/ATen/native/mps/operations/UnaryOps.mm +++ b/aten/src/ATen/native/mps/operations/UnaryOps.mm @@ -1,6 +1,7 @@ // Copyright Ā© 2022 Apple Inc. #define TORCH_ASSERT_ONLY_METHOD_OPERATORS #include +#include #include #include @@ -18,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -28,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -35,7 +38,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -70,26 +75,13 @@ static bool is_empty_tensor(const Tensor& self) { return self.numel() == 0; } -static void unary_op(const Tensor& self, - const Tensor& output_, - std::string op_name, - UnaryOpBlock unaryBlock, - is_noop_p is_noop = is_empty_tensor) { +static void unary_op_noresize(const Tensor& self, const Tensor& output_, std::string op_name, UnaryOpBlock unaryBlock) { TORCH_CHECK(!(!is_macos_13_or_newer() && self.scalar_type() == ScalarType::Byte), "MPS support unary op with uint8 natively starting from macOS 13.0"); - if (!output_.is_same_size(self)) { - output_.resize_(self.sizes()); - } - - if (is_noop(self)) { - output_.copy_(self); - return; - } - auto output = output_; bool needsCopyToOutput = false; - if (output.storage_offset() || !output.is_contiguous()) { + if (needsGather(output)) { output = at::empty(output.sizes(), output.scalar_type(), c10::nullopt, kMPS, c10::nullopt, c10::nullopt); needsCopyToOutput = true; } @@ -125,11 +117,8 @@ static void unary_op(const Tensor& self, auto selfPlaceholder = Placeholder(cachedGraph->inputTensor_, self_, /*mpsShape=*/nullptr, gatherTensorData); auto outputPlaceholder = Placeholder(cachedGraph->outputTensor_, output, /*mpsShape=*/nullptr, false); - NSDictionary* feeds = - @{selfPlaceholder.getMPSGraphTensor() : selfPlaceholder.getMPSGraphTensorData()}; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(selfPlaceholder); + runMPSGraph(getCurrentMPSStream(), cachedGraph->graph(), feeds, outputPlaceholder); if (needsCopyToOutput) { output_.copy_(output); @@ 
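Several files in this diff delete near-identical compile-library / pipeline-state helpers in favour of a shared MetalShaderLibrary. The pattern those helpers all repeated is a string-keyed build-once cache; a generic sketch of that pattern, with illustrative names only:

#include <functional>
#include <string>
#include <unordered_map>

template <typename Artifact>
class KeyedCache {
 public:
  // Build the artifact for `key` once, memoize it, and return the cached copy afterwards.
  Artifact& getOrCreate(const std::string& key, const std::function<Artifact()>& build) {
    auto it = cache_.find(key);
    if (it != cache_.end()) {
      return it->second;  // already compiled for this dtype/function key
    }
    return cache_.emplace(key, build()).first->second;
  }

 private:
  std::unordered_map<std::string, Artifact> cache_;
};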
-137,6 +126,23 @@ static void unary_op(const Tensor& self, } } +static void unary_op(const Tensor& self, + const Tensor& output_, + std::string op_name, + UnaryOpBlock unaryBlock, + is_noop_p is_noop = is_empty_tensor) { + if (!output_.is_same_size(self)) { + output_.resize_(self.sizes()); + } + + if (is_noop(self)) { + output_.copy_(self); + return; + } + + unary_op_noresize(self, output_, op_name, unaryBlock); +} + MPSGraphTensor* trunc_tensor(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { // Rounding is a no-op for integral types, and also a reasonable workaround // For MPSGraph bug on Apple Silicon, that throws `Function floorOp_i64 was not found in the library` @@ -166,6 +172,12 @@ static void unary_op(const Tensor& self, return [mpsGraph logarithmWithTensor:addedTensor name:nil]; } +static MPSGraphTensor* lengthOfComplexAsReal(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { + auto squares = [mpsGraph squareWithTensor:inputTensor name:nil]; + auto sumSquares = [mpsGraph reductionSumWithTensor:squares axis:-1 name:nil]; + return [mpsGraph squareRootWithTensor:sumSquares name:nil]; +} + } // namespace mps TORCH_IMPL_FUNC(trunc_out_mps)(const Tensor& self, const Tensor& output) { @@ -224,14 +236,6 @@ static void unary_op(const Tensor& self, }); \ } -#define CREATE_MPS_UNARY_TORCH_IMPL_FUNC(func_out, func_stub) \ - Tensor& func_out(const Tensor& self, Tensor& output) { \ - mps::unary_op(self, output, #func_out, ^MPSGraphTensor*(MPSGraph * mpsGraph, MPSGraphTensor * inputTensor) { \ - return [mpsGraph func_stub##WithTensor:inputTensor name:nil]; \ - }); \ - return output; \ - } - CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(exp_out_mps, exponent) CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(exp2_out_mps, exponentBase2) CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(reciprocal_out_mps, reciprocal) @@ -255,7 +259,35 @@ static void unary_op(const Tensor& self, CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(acosh_out_mps, acosh) CREATE_MPS_STRUCTURED_UNARY_TORCH_IMPL_FUNC(atanh_out_mps, atanh) -CREATE_MPS_UNARY_TORCH_IMPL_FUNC(abs_out_mps, absolute) +Tensor& abs_out_mps(const Tensor& self, Tensor& output) { + using namespace mps; + + if (!output.is_same_size(self)) { + output.resize_(self.sizes()); + } + + if (self.numel() == 0) { + return output; + } + + if (supportsComplex() || !self.is_complex()) { + unary_op_noresize(self, output, "abs_out_mps", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { + auto rc = [mpsGraph absoluteWithTensor:inputTensor name:nil]; + if (self.is_complex()) { + rc = [mpsGraph realPartOfTensor:rc name:nil]; + } + return rc; + }); + } else { + Tensor realInput = at::view_as_real(self); + unary_op_noresize( + realInput, output, "abs_out_mps", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { + auto rc = lengthOfComplexAsReal(mpsGraph, inputTensor); + return [mpsGraph reshapeTensor:rc withShape:getMPSShape(output) name:nil]; + }); + } + return output; +} Tensor& logical_not_out_mps(const Tensor& self, Tensor& output) { auto bool_self = self.to(ScalarType::Bool); @@ -394,13 +426,8 @@ Tensor logit_mps(const Tensor& self, c10::optional eps) { Placeholder gradInputPlaceholder = Placeholder(cachedGraph->gradInputTensor_, grad_input); // Create dictionary of inputs and outputs - NSDictionary* feeds = @{ - gradOutputPlaceholder.getMPSGraphTensor() : gradOutputPlaceholder.getMPSGraphTensorData(), - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; - NSDictionary* results = - 
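When native complex support is unavailable, abs_out_mps above views the complex input as (real, imag) pairs and computes sqrt(re^2 + im^2) via lengthOfComplexAsReal. A CPU sketch of that fallback on interleaved storage:

#include <cmath>
#include <vector>

// Input is interleaved as re0, im0, re1, im1, ...; output holds one magnitude per complex value.
std::vector<float> complex_abs_ref(const std::vector<float>& interleaved) {
  std::vector<float> out(interleaved.size() / 2);
  for (size_t i = 0; i < out.size(); ++i) {
    const float re = interleaved[2 * i];
    const float im = interleaved[2 * i + 1];
    out[i] = std::sqrt(re * re + im * im);
  }
  return out;
}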
@{gradInputPlaceholder.getMPSGraphTensor() : gradInputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + auto feeds = dictionaryFromPlaceholders(gradOutputPlaceholder, inputPlaceholder); + runMPSGraph(stream, cachedGraph->graph(), feeds, gradInputPlaceholder); } } @@ -437,7 +464,7 @@ static void cumulative_op_impl(const Tensor& self, // issue #103810551: cumsum / cumprod are broken for int8, int16 and as chances for overflow are pretty high, cast to // int32 fixed in macOS 13.3 - bool castInputData = (isIntegralType(input.scalar_type(), false) && input.scalar_type() != ScalarType::Int && + bool castInputData = (isIntegralType(input.scalar_type(), true) && input.scalar_type() != ScalarType::Int && input.scalar_type() != ScalarType::Long); TORCH_CHECK(macOS13_3_plus || input.scalar_type() != ScalarType::Long, @@ -487,9 +514,7 @@ static void cumulative_op_impl(const Tensor& self, Tensor realOutput = at::view_as_real(output); auto complex_sgn_op = [&](MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) -> MPSGraphTensor* { - MPSGraphTensor* squares = [mpsGraph squareWithTensor:inputTensor name:nil]; - MPSGraphTensor* sumSquares = [mpsGraph reductionSumWithTensor:squares axis:-1 name:nil]; - MPSGraphTensor* norm = [mpsGraph squareRootWithTensor:sumSquares name:nil]; + MPSGraphTensor* norm = mps::lengthOfComplexAsReal(mpsGraph, inputTensor); MPSGraphTensor* zero = [mpsGraph constantWithScalar:0.0 dataType:norm.dataType]; MPSGraphTensor* isZero = [mpsGraph equalWithPrimaryTensor:norm secondaryTensor:zero name:nil]; MPSGraphTensor* sgnTensor = [mpsGraph divisionWithPrimaryTensor:inputTensor secondaryTensor:norm name:nil]; @@ -499,4 +524,20 @@ static void cumulative_op_impl(const Tensor& self, mps::unary_op(realInput, realOutput, "sgn_out_mps", complex_sgn_op); } +Tensor& conj_physical_out_mps(const Tensor& self, Tensor& result) { + TORCH_CHECK(self.is_complex()); + if (!mps::supportsComplex()) { + if (!result.is_same_size(self)) { + result.resize_(self.sizes()); + } + at::real(result).copy_(at::real(self)); + at::imag(result).copy_(at::neg(at::imag(self))); + } else { + mps::unary_op(self, result, "conj", ^MPSGraphTensor*(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor) { + return [mpsGraph conjugateWithTensor:inputTensor name:nil]; + }); + } + return result; +} + } // namespace at::native diff --git a/aten/src/ATen/native/mps/operations/Unique.mm b/aten/src/ATen/native/mps/operations/Unique.mm index cda420da5c03b..fc30c2d0b797c 100644 --- a/aten/src/ATen/native/mps/operations/Unique.mm +++ b/aten/src/ATen/native/mps/operations/Unique.mm @@ -107,7 +107,7 @@ name:nil]; MPSGraphTensor* mask = [graph castTensor:notEqualToPreviousElement toType:MPSDataTypeInt32 name:@"castMaskTensor"]; - // If comparing tensors, not scalars, check if entire tensor matches previos element using reductionOr over tensor + // If comparing tensors, not scalars, check if entire tensor matches previous element using reductionOr over tensor if (dimOpt.has_value() && [shape count] != 1) { NSMutableArray* axes = [[NSMutableArray alloc] initWithCapacity:[shape count] - 1]; for (const auto axis : c10::irange([shape count])) { @@ -186,11 +186,7 @@ @autoreleasepool { string key = getUniqueKey(self.scalar_type(), self.sizes(), return_inverse, return_counts, consecutive, dim); return LookUpOrCreateCachedGraph(key, [&](auto mpsGraph, auto newCachedGraph) { - // Workaround for MPSShaderLibrary bug - // TODO: Remove once https://github.com/pytorch/pytorch/issues/82305 is resolved - auto 
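The conj_physical_out_mps fallback above copies the real part and negates the imaginary part when complex tensors are not natively supported. On interleaved storage that is simply:

#include <vector>

// Input is interleaved as re0, im0, re1, im1, ...
std::vector<float> conj_ref(const std::vector<float>& interleaved) {
  std::vector<float> out(interleaved.size());
  for (size_t i = 0; i + 1 < interleaved.size(); i += 2) {
    out[i] = interleaved[i];           // real part copied
    out[i + 1] = -interleaved[i + 1];  // imaginary part negated
  }
  return out;
}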
inputType = getMPSScalarType(self.scalar_type()); - newCachedGraph->inputTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, inputType, getMPSShape(self.sizes())); - + newCachedGraph->inputTensor_ = mpsGraphRankedPlaceHolder(mpsGraph, getMPSScalarType(self), getMPSShape(self)); auto outputTensors = buildUniqueGraph(self, newCachedGraph, return_inverse, return_counts, consecutive, dim); newCachedGraph->outputTensor_ = outputTensors[0]; @@ -210,9 +206,7 @@ static void runUniqueGraph(UniqueCachedGraph* uniqueGraph, bool return_inverse, bool return_counts) { Placeholder inputPlaceholder = Placeholder(uniqueGraph->inputTensor_, input); - NSDictionary* feeds = @{ - inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), - }; + auto feeds = dictionaryFromPlaceholders(inputPlaceholder); NSMutableDictionary* results = [NSMutableDictionary dictionary]; Placeholder outputPlaceholder = Placeholder(uniqueGraph->outputTensor_, output); @@ -285,7 +279,7 @@ static void runUniqueGraph(UniqueCachedGraph* uniqueGraph, } static std::tuple castToMPS(std::tuple out) { - return std::make_tuple(get<0>(out).to("mps"), get<1>(out).to("mps"), get<2>(out).to("mps")); + return std::make_tuple(std::get<0>(out).to("mps"), std::get<1>(out).to("mps"), std::get<2>(out).to("mps")); } std::tuple unique_consecutive_mps(const Tensor& self, @@ -294,7 +288,7 @@ static void runUniqueGraph(UniqueCachedGraph* uniqueGraph, c10::optional dim) { if (!is_macos_13_or_newer()) { TORCH_WARN_ONCE("MPS: unique_consecutive op is supported natively starting from macOS 13.0. ", - "Falling back on CPU. This may have performace implications."); + "Falling back on CPU. This may have performance implications."); return castToMPS(at::unique_consecutive(self.to("cpu"), return_inverse, return_counts, dim)); } @@ -307,7 +301,7 @@ static void runUniqueGraph(UniqueCachedGraph* uniqueGraph, const bool return_counts) { if (!is_macos_13_or_newer()) { TORCH_WARN_ONCE("MPS: unique_dim_consecutive op is supported natively starting from macOS 13.0. ", - "Falling back on CPU. This may have performace implications."); + "Falling back on CPU. This may have performance implications."); return castToMPS(at::unique_dim_consecutive(self.to("cpu"), dim, return_inverse, return_counts)); } @@ -320,7 +314,7 @@ static void runUniqueGraph(UniqueCachedGraph* uniqueGraph, const bool return_counts) { if (!is_macos_13_or_newer()) { TORCH_WARN_ONCE("MPS: _unique2 op is supported natively starting from macOS 13.0. ", - "Falling back on CPU. This may have performace implications."); + "Falling back on CPU. 
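The unique graph above marks positions where an element differs from its predecessor (reducing with OR across the non-unique dims when a dim is given); a running sum of that mask yields consecutive group ids. A serial sketch for the 1-D case:

#include <vector>

std::vector<int> unique_consecutive_groups(const std::vector<int>& x) {
  std::vector<int> group(x.size(), 0);
  int id = 0;
  for (size_t i = 1; i < x.size(); ++i) {
    if (x[i] != x[i - 1]) ++id;  // a new unique run starts here
    group[i] = id;
  }
  return group;  // e.g. {1,1,2,2,2,1} -> {0,0,1,1,1,2}
}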
This may have performance implications."); return castToMPS(at::_unique2(self.to("cpu"), sorted, return_inverse, return_counts)); } diff --git a/aten/src/ATen/native/mps/operations/UpSample.mm b/aten/src/ATen/native/mps/operations/UpSample.mm index 64fe89b7f539d..f4973f6000156 100644 --- a/aten/src/ATen/native/mps/operations/UpSample.mm +++ b/aten/src/ATen/native/mps/operations/UpSample.mm @@ -20,6 +20,10 @@ #include #include #include +#include +#include +#include +#include #include #include #include @@ -36,9 +40,9 @@ // supported resize_mode: 'nearest' | 'bilinear' | 'nearest-exact' static void upsample_out_template(const Tensor& input, IntArrayRef output_size, - c10::optional input_size_opt, // only used for backward pass - c10::optional scale_h_opt, - c10::optional scale_w_opt, + std::optional input_size_opt, // only used for backward pass + std::optional scale_h_opt, + std::optional scale_w_opt, const Tensor& output, bool align_corners, const c10::string_view resize_mode_str) { @@ -225,9 +229,7 @@ static void upsample_out_template(const Tensor& input, inputPlaceholder.getMPSGraphTensor() : inputPlaceholder.getMPSGraphTensorData(), cachedGraph->outputSizeTensor : sizeTensorData, }; - NSDictionary* results = - @{outputPlaceholder.getMPSGraphTensor() : outputPlaceholder.getMPSGraphTensorData()}; - runMPSGraph(stream, cachedGraph->graph(), feeds, results); + runMPSGraph(stream, cachedGraph->graph(), feeds, outputPlaceholder); if (out.has_storage()) { output.copy_(out); @@ -237,7 +239,7 @@ static void upsample_out_template(const Tensor& input, } // namespace mps -static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10::optional scale) { +static bool check_mps_compatibility(const c10::string_view resize_mode_str, std::optional scale) { static const bool is_macOS_13_0_or_newer = is_macos_13_or_newer(); if (!is_macOS_13_0_or_newer) { // passing scale factors to MPS's resize APIs is not supported on macOS < 13 @@ -260,7 +262,7 @@ static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: } TORCH_IMPL_FUNC(upsample_nearest1d_out_mps) -(const Tensor& input, IntArrayRef output_size, c10::optional scale, const Tensor& output) { +(const Tensor& input, IntArrayRef output_size, std::optional scale, const Tensor& output) { if (check_mps_compatibility("nearest", scale)) { mps::upsample_out_template(input, output_size, c10::nullopt, c10::nullopt, scale, output, false, "nearest"); } else { @@ -272,7 +274,7 @@ static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: (const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scale, + std::optional scale, const Tensor& grad_input) { if (check_mps_compatibility("nearest", scale)) { mps::upsample_out_template(grad_output, output_size, input_size, c10::nullopt, scale, grad_input, false, "nearest"); @@ -282,7 +284,7 @@ static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: } TORCH_IMPL_FUNC(_upsample_nearest_exact1d_out_mps) -(const Tensor& input, IntArrayRef output_size, c10::optional scale, const Tensor& output) { +(const Tensor& input, IntArrayRef output_size, std::optional scale, const Tensor& output) { if (check_mps_compatibility("nearest-exact", scale)) { mps::upsample_out_template(input, output_size, c10::nullopt, c10::nullopt, scale, output, false, "nearest-exact"); } else { @@ -294,7 +296,7 @@ static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: (const Tensor& grad_output, IntArrayRef output_size, 
IntArrayRef input_size, - c10::optional scale, + std::optional scale, const Tensor& grad_input) { if (check_mps_compatibility("nearest-exact", scale)) { mps::upsample_out_template( @@ -307,8 +309,8 @@ static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: TORCH_IMPL_FUNC(upsample_nearest2d_out_mps) (const Tensor& input, IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { if (check_mps_compatibility("nearest", scales_w)) { mps::upsample_out_template(input, output_size, c10::nullopt, scales_h, scales_w, output, false, "nearest"); @@ -321,8 +323,8 @@ static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: (const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { if (check_mps_compatibility("nearest", scales_w)) { mps::upsample_out_template(grad_output, output_size, input_size, scales_h, scales_w, grad_input, false, "nearest"); @@ -335,8 +337,8 @@ static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: TORCH_IMPL_FUNC(_upsample_nearest_exact2d_out_mps) (const Tensor& input, IntArrayRef output_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { if (check_mps_compatibility("nearest-exact", scales_w)) { mps::upsample_out_template(input, output_size, c10::nullopt, scales_h, scales_w, output, false, "nearest-exact"); @@ -349,8 +351,8 @@ static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: (const Tensor& grad_output, IntArrayRef output_size, IntArrayRef input_size, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { if (check_mps_compatibility("nearest-exact", scales_w)) { mps::upsample_out_template( @@ -361,12 +363,38 @@ static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: } } +TORCH_IMPL_FUNC(upsample_linear1d_out_mps) +(const Tensor& input, IntArrayRef output_size, bool align_corners, std::optional scale, const Tensor& output) { + if (check_mps_compatibility("bilinear", scale)) { + mps::upsample_out_template( + input, output_size, c10::nullopt, c10::nullopt, scale, output, align_corners, "bilinear"); + } else { + output.copy_(at::upsample_linear1d(input.to("cpu"), output_size, align_corners, scale)); + } +} + +TORCH_IMPL_FUNC(upsample_linear1d_backward_out_mps) +(const Tensor& grad_output, + IntArrayRef output_size, + IntArrayRef input_size, + bool align_corners, + std::optional scale, + const Tensor& grad_input) { + if (check_mps_compatibility("bilinear", scale)) { + mps::upsample_out_template( + grad_output, output_size, input_size, c10::nullopt, scale, grad_input, align_corners, "bilinear"); + } else { + grad_input.copy_( + at::upsample_linear1d_backward(grad_output.to("cpu"), output_size, input_size, align_corners, scale)); + } +} + TORCH_IMPL_FUNC(upsample_bilinear2d_out_mps) (const Tensor& input, IntArrayRef output_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& output) { if (check_mps_compatibility("bilinear", scales_w)) { mps::upsample_out_template(input, output_size, c10::nullopt, scales_h, scales_w, output, align_corners, "bilinear"); @@ -380,8 +408,8 @@ 
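// Hypothetical usage sketch, not taken from this patch: the new
// TORCH_IMPL_FUNC(upsample_linear1d_out_mps) / upsample_linear1d_backward_out_mps
// entry points introduced above reuse the shared "bilinear" MPS resize template,
// so a 1-D linear resize can stay on the MPS device, e.g.
//
//   auto x = at::rand({1, 3, 8}, at::TensorOptions().device(at::kMPS));
//   auto y = at::upsample_linear1d(x, {16}, /*align_corners=*/false, c10::nullopt);
//
// When check_mps_compatibility("bilinear", scale) rejects the request (older
// macOS, or scale factors it cannot pass through), the else-branches above fall
// back to the CPU kernel and copy the result back, exactly as written in the hunk.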
static bool check_mps_compatibility(const c10::string_view resize_mode_str, c10: IntArrayRef output_size, IntArrayRef input_size, bool align_corners, - c10::optional scales_h, - c10::optional scales_w, + std::optional scales_h, + std::optional scales_w, const Tensor& grad_input) { if (check_mps_compatibility("bilinear", scales_w)) { mps::upsample_out_template( diff --git a/aten/src/ATen/native/mps/operations/View.mm b/aten/src/ATen/native/mps/operations/View.mm index 0d276c4a0c076..b583a19ef5e61 100644 --- a/aten/src/ATen/native/mps/operations/View.mm +++ b/aten/src/ATen/native/mps/operations/View.mm @@ -4,6 +4,8 @@ #include #include #include +// For MTLLanguageVersion_3_1 +#include #include #include @@ -91,7 +93,7 @@ MPSGraphTensorData* outputTensorData = [[[MPSGraphTensorData alloc] initWithMTLBuffer:outputBuffer shape:outputShape dataType:outputType] autorelease]; - NSDictionary* results = @{cachedGraph->outputTensor : outputTensorData}; + auto results = @{cachedGraph->outputTensor : outputTensorData}; runMPSGraph(stream, cachedGraph->graph(), feeds, results); } return output; @@ -732,6 +734,7 @@ static IntArrayRef updateTensorBaseShape(const Tensor& self) { static std::unordered_map scalarToMetalType = { {c10::ScalarType::Float, "float"}, {c10::ScalarType::Half, "half"}, + {c10::ScalarType::BFloat16, "bfloat"}, {c10::ScalarType::Long, "long"}, {c10::ScalarType::Int, "int"}, {c10::ScalarType::Short, "short"}, @@ -747,66 +750,36 @@ static IntArrayRef updateTensorBaseShape(const Tensor& self) { return it->second; } -static std::string genScatterGatherCvtFunc(const std::string& dtypeSrc, const std::string& dtypeDst) { +static std::string genScatterGatherCvtFunc(const std::string& dtypeSrc, const std::string& dtypeDst, bool needsConj) { const bool srcComplex = dtypeSrc[dtypeSrc.size() - 1] == '2'; const bool dstComplex = dtypeDst[dtypeDst.size() - 1] == '2'; if (dstComplex) { - return dtypeDst + (srcComplex ? "(x.x, x.y)" : "(x, 0.0)"); + return dtypeDst + (srcComplex ? needsConj ? "(x.x, -x.y)" : "(x.x, x.y)" : "(x, 0.0)"); } if (srcComplex) { + // TODO: Document why explicit cast is needed only for bfloat types + if (dtypeDst == "bfloat") { + return "bfloat(x.x)"; + } return "x.x"; } - return "x"; + // TODO: Document why explicit cast is needed only for bfloat types + if (dtypeDst == "bfloat") { + return "bfloat(x)"; + } + return "(x)"; } -static id compileGatherScatterOpsLibrary(id device, - const std::string& dtypeSrc, - const std::string& dtypeDst, - bool needsScatter) { - auto key = std::to_string(needsScatter) + dtypeSrc + dtypeDst; - static std::unordered_map> _libCache; - auto it = _libCache.find(key); - if (it != _libCache.end()) { - return it->second; - } - NSError* error = nil; - MTLCompileOptions* options = [[MTLCompileOptions new] autorelease]; - [options setLanguageVersion:MTLLanguageVersion2_3]; - const auto shaderStr = fmt::format(needsScatter ? 
SCATTER_OPS_TEMPLATE : GATHER_OPS_TEMPLATE, - dtypeSrc, - dtypeDst, - genScatterGatherCvtFunc(dtypeSrc, dtypeDst)); - auto gatherScatterLib = [device newLibraryWithSource:[NSString stringWithUTF8String:shaderStr.c_str()] - options:options - error:&error]; - TORCH_CHECK(gatherScatterLib != nil && error == nil, - "Failed to compile gather-scatter library, error: ", - [[error description] UTF8String]); - _libCache[key] = gatherScatterLib; - return gatherScatterLib; -} +static MetalShaderLibrary scatterLib(SCATTER_OPS_TEMPLATE, 3); +static MetalShaderLibrary gatherLib(GATHER_OPS_TEMPLATE, 3); -static id getPipelineState(id device, - const std::string& kernel, +static id getPipelineState(const std::string& kernel, const std::string& dtypeSrc, const std::string& dtypeDst, - bool needsScatter) { - auto key = kernel + dtypeSrc + dtypeDst; - static std::unordered_map> _mtlPipelineCache; - auto it = _mtlPipelineCache.find(key); - if (it != _mtlPipelineCache.end()) { - return it->second; - } - - NSError* error = nil; - id library = compileGatherScatterOpsLibrary(device, dtypeSrc, dtypeDst, needsScatter); - id func = [library newFunctionWithName:[NSString stringWithUTF8String:kernel.c_str()]]; - TORCH_CHECK(func, "Failed to load the Metal Shader function: ", kernel); - id pso = [device newComputePipelineStateWithFunction:func error:&error]; - TORCH_CHECK( - pso != nil && error == nil, "Failed to construct pipeline state: ", [[error localizedDescription] UTF8String]); - _mtlPipelineCache[key] = pso; - return pso; + bool needsScatter, + bool needsConj) { + auto cvtFunc = genScatterGatherCvtFunc(dtypeSrc, dtypeDst, needsConj); + return (needsScatter ? scatterLib : gatherLib).getPipelineStateForFunc(kernel, {dtypeSrc, dtypeDst, cvtFunc}); } Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst) { @@ -831,11 +804,11 @@ Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst) { dispatch_sync_with_rethrow(mpsStream->queue(), ^() { id computeEncoder = mpsStream->commandEncoder(); std::string functionName = getGatherScatterFunctionName(output.scalar_type(), output.dim(), /*needsScatter=*/false); - id gatherPSO = getPipelineState(MPSDevice::getInstance()->device(), - functionName, + id gatherPSO = getPipelineState(functionName, getGatherScatterScalarType(src), getGatherScatterScalarType(output), - /*needsScatter=*/false); + /*needsScatter=*/false, + src.is_conj() != dst.is_conj()); // this function call is a no-op if MPS Profiler is not enabled getMPSProfiler().beginProfileKernel(gatherPSO, functionName, {src, output}); @@ -888,11 +861,11 @@ Tensor gatherViewTensor(const at::Tensor& src, at::Tensor& dst) { id computeEncoder = mpsStream->commandEncoder(); std::string functionName = getGatherScatterFunctionName(output.scalar_type(), output.dim(), /*needsScatter=*/true); - id scatterPSO = getPipelineState(MPSDevice::getInstance()->device(), - functionName, + id scatterPSO = getPipelineState(functionName, getGatherScatterScalarType(src), getGatherScatterScalarType(output), - /*needsScatter=*/true); + /*needsScatter=*/true, + src.is_conj() != output.is_conj()); getMPSProfiler().beginProfileKernel(scatterPSO, functionName, {src, output}); diff --git a/aten/src/ATen/native/mps/operations/WeightNorm.mm b/aten/src/ATen/native/mps/operations/WeightNorm.mm index 7ca63533ed19b..6cc20cfa01aa4 100644 --- a/aten/src/ATen/native/mps/operations/WeightNorm.mm +++ b/aten/src/ATen/native/mps/operations/WeightNorm.mm @@ -76,16 +76,8 @@ Placeholder norms_placeholder = Placeholder(cachedGraph->norms_, norms); 
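// Illustrative sketch, not taken from this patch: judging from the hand-written
// NSDictionary literals it replaces in the hunks above and below,
// dictionaryFromPlaceholders(...) builds the same feeds/results mapping in one
// call, roughly:
//
//   static NSDictionary* dictionaryFromPlaceholders(const Placeholder& a,
//                                                   const Placeholder& b) {
//     return @{
//       a.getMPSGraphTensor() : a.getMPSGraphTensorData(),
//       b.getMPSGraphTensor() : b.getMPSGraphTensorData(),
//     };
//   }
//
// with forms for one to four placeholders, as the call sites in these hunks use;
// runMPSGraph() also appears to gain an overload that accepts a single results
// Placeholder directly, which is why several call sites above no longer build a
// results dictionary at all.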
Placeholder w_placeholder = Placeholder(cachedGraph->w_, w); - NSDictionary* feeds = @{ - v_placeholder.getMPSGraphTensor() : v_placeholder.getMPSGraphTensorData(), - g_placeholder.getMPSGraphTensor() : g_placeholder.getMPSGraphTensorData() - }; - - NSDictionary* results = @{ - norms_placeholder.getMPSGraphTensor() : norms_placeholder.getMPSGraphTensorData(), - w_placeholder.getMPSGraphTensor() : w_placeholder.getMPSGraphTensorData() - }; - + auto feeds = dictionaryFromPlaceholders(v_placeholder, g_placeholder); + auto results = dictionaryFromPlaceholders(norms_placeholder, w_placeholder); runMPSGraph(mpsStream, cachedGraph->graph(), feeds, results); } @@ -171,18 +163,8 @@ Placeholder grad_g_placeholder = Placeholder(cachedGraph->grad_g, grad_g); Placeholder grad_v_placeholder = Placeholder(cachedGraph->grad_v, grad_v); - NSDictionary* feeds = @{ - grad_w_placeholder.getMPSGraphTensor() : grad_w_placeholder.getMPSGraphTensorData(), - norms_placeholder.getMPSGraphTensor() : norms_placeholder.getMPSGraphTensorData(), - v_placeholder.getMPSGraphTensor() : v_placeholder.getMPSGraphTensorData(), - g_placeholder.getMPSGraphTensor() : g_placeholder.getMPSGraphTensorData() - }; - - NSDictionary* results = @{ - grad_g_placeholder.getMPSGraphTensor() : grad_g_placeholder.getMPSGraphTensorData(), - grad_v_placeholder.getMPSGraphTensor() : grad_v_placeholder.getMPSGraphTensorData() - }; - + auto feeds = dictionaryFromPlaceholders(grad_w_placeholder, norms_placeholder, v_placeholder, g_placeholder); + auto results = dictionaryFromPlaceholders(grad_g_placeholder, grad_v_placeholder); runMPSGraph(mpsStream, cachedGraph->graph(), feeds, results); } diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 4960417abdbef..b75bc85bbed53 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -134,7 +134,7 @@ autogen: _new_zeros_with_same_feature_meta.out # This function compares the storage numel of self with that of other, where -# storage numel is cumputed as: `other.storage().nbytes() / other.itemsize()`. +# storage numel is computed as: `other.storage().nbytes() / other.itemsize()`. # We create this function for composite compliance purposes. The batching rule # always returns true because vmapped as_strided does not support accessing # storage locations not indexable by the input tensor. @@ -189,6 +189,10 @@ - func: _assert_tensor_metadata(Tensor a, SymInt[]? size=None, SymInt[]? stride=None, ScalarType? dtype=None) -> () +- func: _print(str s) -> () + dispatch: + CompositeExplicitAutograd: _print + - func: sym_constrain_range(Scalar size, *, int? min=None, int? max=None) -> () dispatch: CompositeExplicitAutograd: sym_constrain_range @@ -478,6 +482,7 @@ - func: conj_physical.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: CPU, CUDA: conj_physical_out + MPS: conj_physical_out_mps SparseCPU, SparseCUDA: conj_physical_out_sparse SparseCsrCPU, SparseCsrCUDA: conj_physical_sparse_csr_out tags: pointwise @@ -544,8 +549,8 @@ structured_delegate: add.out variants: function, method dispatch: - SparseCPU, SparseCUDA: add_sparse - SparseCsrCPU, SparseCsrCUDA: add_sparse_csr + SparseCPU, SparseCUDA, SparseMeta: add_sparse + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr MkldnnCPU: mkldnn_add ZeroTensor: add_zerotensor NestedTensorCPU, NestedTensorCUDA: NestedTensor_add_Tensor @@ -556,8 +561,8 @@ variants: method structured_delegate: add.out dispatch: - SparseCPU, SparseCUDA: add_sparse_ - SparseCsrCPU, SparseCsrCUDA: add_sparse_csr_ + SparseCPU, SparseCUDA, SparseMeta: add_sparse_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: add_sparse_csr_ MkldnnCPU: mkldnn_add_ NestedTensorCPU, NestedTensorCUDA: NestedTensor_add__Tensor tags: pointwise @@ -570,9 +575,9 @@ Generic: add (AllAndComplex, BFloat16, Half, ComplexHalf) ScalarOnly: add (Bool) dispatch: - SparseCPU: add_out_sparse_cpu + SparseCPU, SparseMeta: add_out_sparse_cpu SparseCUDA: add_out_sparse_cuda - SparseCsrCPU: add_out_sparse_compressed_cpu + SparseCsrCPU, SparseCsrMeta: add_out_sparse_compressed_cpu SparseCsrCUDA: add_out_sparse_compressed_cuda MkldnnCPU: mkldnn_add_out MPS: add_out_mps @@ -771,7 +776,7 @@ dispatch: CompositeExplicitAutograd: arange -# This operator should be named `aragne.start_out` if following the naming convention. However that +# This operator should be named `arange.start_out` if following the naming convention. However that # name is already taken. Disabled because of CI job failures. # FIXME: enable this #- func: arange.start_out_(Scalar start, Scalar end, *, Tensor(a!) out) -> Tensor(a!) @@ -1228,6 +1233,13 @@ CompositeExplicitAutograd: copysign_out tags: pointwise +- func: _lazy_clone(Tensor self) -> Tensor + # Like clone, but the copy takes place lazily, only if either the + # input or the output are written. + variants: function, method + dispatch: + CompositeExplicitAutograd: _lazy_clone + - func: logical_not(Tensor self) -> Tensor device_check: NoCheck # TensorIterator variants: function, method @@ -2358,7 +2370,7 @@ Meta: empty_meta_symint MkldnnCPU: empty_mkldnn SparseCPU, SparseCUDA, SparseMeta: empty_sparse - SparseCsrCPU, SparseCsrCUDA: empty_sparse_compressed + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_sparse_compressed QuantizedCPU, QuantizedCUDA, QuantizedMeta: empty_unknown_quantized tags: core @@ -2464,7 +2476,7 @@ CompositeExplicitAutograd: empty_like QuantizedCPU, QuantizedCUDA: empty_like_quantized SparseCPU, SparseCUDA, SparseMeta: empty_like_sparse_coo - SparseCsrCPU, SparseCsrCUDA: empty_like_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: empty_like_sparse_csr NestedTensorCPU, NestedTensorCUDA: empty_like_nested autogen: empty_like.out @@ -2966,12 +2978,14 @@ dispatch: CPU: _fft_r2c_mkl CUDA: _fft_r2c_cufft + MPS: _fft_r2c_mps - func: _fft_r2c.out(Tensor self, int[] dim, int normalization, bool onesided, *, Tensor(a!) out) -> Tensor(a!) variants: function dispatch: CPU: _fft_r2c_mkl_out CUDA: _fft_r2c_cufft_out + MPS: _fft_r2c_mps_out # Complex to real inverse FFT - func: _fft_c2r(Tensor self, int[] dim, int normalization, SymInt last_dim_size) -> Tensor @@ -2979,12 +2993,14 @@ dispatch: CPU: _fft_c2r_mkl CUDA: _fft_c2r_cufft + MPS: _fft_c2r_mps - func: _fft_c2r.out(Tensor self, int[] dim, int normalization, SymInt last_dim_size, *, Tensor(a!) out) -> Tensor(a!) 
variants: function dispatch: CPU: _fft_c2r_mkl_out CUDA: _fft_c2r_cufft_out + MPS: _fft_c2r_mps_out # Standard complex to complex FFT (forward or backward) - func: _fft_c2c(Tensor self, SymInt[] dim, int normalization, bool forward) -> Tensor @@ -2992,12 +3008,14 @@ dispatch: CPU: _fft_c2c_mkl CUDA: _fft_c2c_cufft + MPS: _fft_c2c_mps - func: _fft_c2c.out(Tensor self, SymInt[] dim, int normalization, bool forward, *, Tensor(a!) out) -> Tensor(a!) variants: function dispatch: CPU: _fft_c2c_mkl_out CUDA: _fft_c2c_cufft_out + MPS: _fft_c2c_mps_out - func: _validate_compressed_sparse_indices(bool is_crow, Tensor compressed_idx, Tensor plain_idx, int cdim, int dim, int nnz) -> () device_check: NoCheck @@ -3109,6 +3127,7 @@ structured: True dispatch: CPU, CUDA: isin_Tensor_Tensor_out + MPS: isin_Tensor_Tensor_out_mps - func: isin.Tensor_Tensor(Tensor elements, Tensor test_elements, *, bool assume_unique=False, bool invert=False) -> Tensor variants: function @@ -3250,6 +3269,8 @@ autogen: native_layer_norm_backward.out tags: core +- func: rms_norm(Tensor input, int[] normalized_shape, Tensor? weight=None, float? eps=None) -> Tensor + - func: nan_to_num(Tensor self, float? nan=None, float? posinf=None, float? neginf=None) -> Tensor variants: function, method dispatch: @@ -3322,10 +3343,31 @@ dispatch: CUDA: _cslt_sparse_mm_search +- func: _sparse_semi_structured_tile(Tensor input, str algorithm="", bool use_cutlass=True) -> (Tensor, Tensor, Tensor, Tensor, Tensor) + dispatch: + CUDA: _sparse_semi_structured_tile + +- func: _sparse_semi_structured_apply(Tensor input, Tensor thread_masks) -> (Tensor, Tensor) + dispatch: + CUDA: _sparse_semi_structured_apply + +- func: _sparse_semi_structured_apply_dense(Tensor input, Tensor thread_masks) -> Tensor + dispatch: + CUDA: _sparse_semi_structured_apply_dense + +# DEPRECATED: Use torch.__sparse_semi_structured_mm/torch._sparse_semi_structured_addmm instead - func: _sparse_semi_structured_linear(Tensor input, Tensor weight, Tensor meta, *, Tensor? bias=None, str? activation=None, ScalarType? out_dtype=None) -> Tensor dispatch: CUDA: _sparse_semi_structured_linear +- func: _sparse_semi_structured_mm(Tensor mat1, Tensor mat1_meta, Tensor mat2, *, ScalarType? out_dtype=None) -> Tensor + dispatch: + CUDA: _sparse_semi_structured_mm + +- func: _sparse_semi_structured_addmm(Tensor input, Tensor mat1, Tensor mat1_meta, Tensor mat2, *, Scalar alpha=1, Scalar beta=1, ScalarType? out_dtype=None) -> Tensor + dispatch: + CUDA: _sparse_semi_structured_addmm + - func: _mixed_dtypes_linear(Tensor input, Tensor weight, Tensor scale, *, Tensor? bias=None, str? activation=None) -> Tensor dispatch: CUDA: _mixed_dtypes_linear @@ -4066,20 +4108,30 @@ - func: _int_mm(Tensor self, Tensor mat2) -> Tensor dispatch: + CPU: _int_mm_cpu CUDA: _int_mm_cuda - func: _int_mm.out(Tensor self, Tensor mat2, *, Tensor(a!) out) -> Tensor(a!) 
dispatch: + CPU: _int_mm_out_cpu CUDA: _int_mm_out_cuda - func: _convert_weight_to_int4pack(Tensor self, int innerKTiles) -> Tensor dispatch: + CPU: _convert_weight_to_int4pack_cpu CUDA: _convert_weight_to_int4pack_cuda - func: _weight_int4pack_mm(Tensor self, Tensor mat2, int qGroupSize, Tensor qScaleAndZeros) -> Tensor dispatch: + CPU: _weight_int4pack_mm_cpu + MPS: _weight_int4pack_mm_mps CUDA: _weight_int4pack_mm_cuda +- func: _weight_int8pack_mm(Tensor self, Tensor mat2, Tensor scales) -> Tensor + dispatch: + CPU: _weight_int8pack_mm_cpu + MPS: _weight_int8pack_mm_mps + - func: _sparse_mm(Tensor sparse, Tensor dense) -> Tensor python_module: sparse @@ -4455,7 +4507,6 @@ MPS: pixel_shuffle_mps CompositeExplicitAutogradNonFunctional: math_pixel_shuffle autogen: pixel_shuffle.out - tags: core - func: pixel_unshuffle(Tensor self, int downscale_factor) -> Tensor dispatch: @@ -4826,7 +4877,7 @@ device_guard: False dispatch: CompositeImplicitAutograd: reshape_symint - CompositeImplicitAutogradNestedTensor: reshape_nested + CompositeImplicitAutogradNestedTensor: reshape_nested_symint - func: _reshape_copy(Tensor self, SymInt[] size) -> Tensor variants: function @@ -4985,6 +5036,7 @@ device_check: NoCheck # TensorIterator python_module: nn dispatch: + QuantizedCPU: gelu_quantized_cpu_ NestedTensorCPU, NestedTensorCUDA: NestedTensor_gelu_ - func: gelu(Tensor self, *, str approximate='none') -> Tensor @@ -5372,6 +5424,21 @@ CompositeExplicitAutograd: slice_backward autogen: slice_backward.out +# NB: This op exists to back the implementation of reverse view_funcs for various views (chunk, +# slice.Tensor, split_with_sizes, et. al.). Currently, these are only used during fake-ification +# of PT2 graph input subclass instances that are views. This means: +# * This op shouldn't really show up in eager mode (so e.g. XLA shouldn't have to implement it) +# * This op shouldn't show up in a PT2 graph (so a PT2 backend shouldn't have to implement it) +# * A subclass will have to implement this to work in PT2 if a subclass view is used as a graph +# input AND the view utilizes this op in its inverse. The idea is that slice_inverse() is +# easier to implement for a subclass than as_strided() +- func: slice_inverse(Tensor(a) self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor(a) + variants: function, method + device_check: NoCheck + device_guard: False + dispatch: + CompositeExplicitAutograd: slice_inverse_symint + - func: slice_scatter(Tensor self, Tensor src, int dim=0, SymInt? start=None, SymInt? end=None, SymInt step=1) -> Tensor variants: function, method device_check: NoCheck @@ -5379,7 +5446,7 @@ dispatch: CompositeExplicitAutogradNonFunctional: slice_scatter autogen: slice_scatter.out - tags: core + tags: [core, view_copy] - func: select_scatter(Tensor self, Tensor src, int dim, SymInt index) -> Tensor variants: function, method @@ -5578,6 +5645,16 @@ SparseCPU: _sspaddmm_out_cpu SparseCUDA: _sspaddmm_out_cuda +- func: _chunk_cat(Tensor[] tensors, int dim, int num_chunks) -> Tensor + dispatch: + CompositeExplicitAutograd: _chunk_cat + CUDA: _chunk_cat_cuda + +- func: _chunk_cat.out(Tensor[] tensors, int dim, int num_chunks, *, Tensor(a!) out) -> Tensor(a!) 
+ dispatch: + CompositeExplicitAutograd: _chunk_cat_out + CUDA: _chunk_cat_out_cuda + - func: stack(Tensor[] tensors, int dim=0) -> Tensor dispatch: CompositeExplicitAutograd: stack @@ -5642,8 +5719,8 @@ variants: function, method dispatch: CompositeExplicitAutograd: sum - SparseCPU, SparseCUDA: sum_coo - SparseCsrCPU, SparseCsrCUDA: sum_csr + SparseCPU, SparseCUDA, SparseMeta: sum_coo + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sum_csr autogen: sum.out - func: sum.dim_IntList(Tensor self, int[1]? dim, bool keepdim=False, *, ScalarType? dtype=None) -> Tensor @@ -5769,6 +5846,7 @@ variants: function dispatch: CPU, CUDA: std_mean + MPS: std_mean_mps autogen: std_mean.correction_out - func: std_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) @@ -6024,7 +6102,6 @@ CPU, MPS: roll CUDA: roll_cuda autogen: roll.out - tags: core # default int[] value [0,1] should not add space after comma, since codegen parser uses ', ' to split args @@ -6107,6 +6184,58 @@ CompositeExplicitAutogradNonFunctional: _nested_view_from_buffer_copy autogen: _nested_view_from_buffer_copy.out +- func: _nested_view_from_jagged(Tensor(a) self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor(a) + variants: function + device_check: NoCheck + dispatch: {} + +- func: _nested_view_from_jagged_copy(Tensor self, Tensor offsets, Tensor dummy, Tensor? lengths=None, int ragged_idx=1) -> Tensor + variants: function + device_check: NoCheck + tags: view_copy + dispatch: + CompositeExplicitAutogradNonFunctional: _nested_view_from_jagged_copy + autogen: _nested_view_from_jagged_copy.out + +- func: _nested_get_values(Tensor(a) self) -> Tensor(a) + variants: function + device_check: NoCheck + dispatch: {} + +- func: _nested_get_values_copy(Tensor self) -> Tensor + variants: function + device_check: NoCheck + tags: view_copy + dispatch: + CompositeExplicitAutogradNonFunctional: _nested_get_values_copy + autogen: _nested_get_values_copy.out + +- func: _nested_get_offsets(Tensor self) -> Tensor + variants: function + device_check: NoCheck + dispatch: {} + +# returns undefined Tensor if no lengths present +- func: _nested_get_lengths(Tensor self) -> Tensor + variants: function + device_check: NoCheck + dispatch: {} + +- func: _nested_get_ragged_idx(Tensor self) -> int + variants: function + device_check: NoCheck + dispatch: {} + +- func: _nested_get_jagged_dummy(Tensor any) -> Tensor + category_override: dummy + dispatch: {} + +- func: _nested_compute_contiguous_strides_offsets(Tensor nested_size) -> (Tensor, Tensor) + variants: function + device_check: NoCheck + dispatch: + CPU, CUDA: _nested_compute_contiguous_strides_offsets + - func: _trilinear(Tensor i1, Tensor i2, Tensor i3, int[] expand1, int[] expand2, int[] expand3, int[] sumdim, int unroll_dim=1) -> Tensor dispatch: # calls unsqueeze @@ -6291,6 +6420,7 @@ variants: function dispatch: CPU, CUDA: var_mean + MPS: var_mean_mps autogen: var_mean.correction_out - func: var_mean.names_dim(Tensor self, Dimname[1] dim, bool unbiased=True, bool keepdim=False) -> (Tensor, Tensor) @@ -6311,15 +6441,13 @@ device_check: NoCheck # TensorIterator variants: function, method dispatch: - CPU, CUDA: where - MPS: where_mps + CPU, CUDA, MPS: where tags: [core, pointwise] - func: where.self_out(Tensor condition, Tensor self, Tensor other, *, Tensor(a!) out) -> Tensor(a!) 
device_check: NoCheck # TensorIterator dispatch: - CPU, CUDA: where_self_out - MPS: where_self_out_mps + CPU, CUDA, MPS: where_self_out - func: where.ScalarSelf(Tensor condition, Scalar self, Tensor other) -> Tensor variants: function @@ -6373,7 +6501,7 @@ CPU: _efficientzerotensor CUDA: _efficientzerotensor_cuda MPS: _efficientzerotensor_mps - Meta: _efficientzerotensor_meta + Meta: _efficientzerotensor_meta_symint autogen: _efficientzerotensor.out - func: zeros(SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=None) -> Tensor @@ -6450,6 +6578,32 @@ SparseCPU, SparseCUDA: norm_sparse autogen: native_norm.ScalarOpt_dim_dtype_out +- func: _batch_norm_with_update(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor) + dispatch: + CPU: _batch_norm_with_update_cpu + CUDA: _batch_norm_with_update_cuda + MPS: _batch_norm_with_update_mps + MkldnnCPU: _batch_norm_with_update_mkldnn + autogen: _batch_norm_with_update_functional + +- func: _batch_norm_with_update.out(Tensor input, Tensor? weight, Tensor? bias, Tensor(a!) running_mean, Tensor(b!) running_var, float momentum, float eps, *, Tensor(d!) out, Tensor(e!) save_mean, Tensor(f!) save_invstd, Tensor(g!) reserve) -> (Tensor(d!), Tensor(e!), Tensor(f!), Tensor(g!)) + dispatch: + CPU: _batch_norm_with_update_cpu_out + CUDA: _batch_norm_with_update_cuda_out + MPS: _batch_norm_with_update_mps_out + +- func: _batch_norm_no_update(Tensor input, Tensor? weight, Tensor? bias, Tensor? running_mean, Tensor? running_var, float momentum, float eps) -> (Tensor, Tensor, Tensor, Tensor) + dispatch: + CompositeExplicitAutograd: _batch_norm_no_update + autogen: _batch_norm_no_update.out + +- func: batch_norm_backward(Tensor grad_out, Tensor input, Tensor weight, Tensor? running_mean, Tensor? running_var, Tensor? save_mean, Tensor? save_var, bool update, float eps, bool[3] output_mask, Tensor reserve) -> (Tensor, Tensor, Tensor) + dispatch: + CPU: _new_batch_norm_backward_cpu + CUDA: _new_batch_norm_backward_cuda + MPS: _new_batch_norm_backward_mps + MkldnnCPU: _new_batch_norm_backward_mkldnn + # TODO: reduce signatures down to one when optional args is available - func: _sparse_sum(Tensor self) -> Tensor @@ -6660,7 +6814,7 @@ MPS: zero_mps_ Meta: zero_meta_ SparseCPU, SparseCUDA, SparseMeta: zero_sparse_ - SparseCsrCPU, SparseCsrCUDA: zero_sparse_csr_ + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: zero_sparse_csr_ MkldnnCPU: mkldnn_zero_ NestedTensorCPU, NestedTensorCUDA: zero_nested_ autogen: zero, zero.out @@ -6950,6 +7104,10 @@ # FIXME: would be nicer if TensorOptions was optional based; not adding default arguments for options given # the default would never make sense. +- func: _sparse_compressed_tensor_with_dims(int nnz, int dense_dim, int[] size, int[] blocksize, ScalarType index_dtype, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? pin_memory=False) -> Tensor + dispatch: + CompositeExplicitAutograd: sparse_compressed_tensor_with_dims + - func: sparse_compressed_tensor.comp_plain_value_size(Tensor compressed_indices, Tensor plain_indices, Tensor values, SymInt[] size, *, ScalarType? dtype=None, Layout? layout=None, Device? device=None, bool? 
pin_memory=False) -> Tensor dispatch: CompositeExplicitAutograd: sparse_compressed_tensor @@ -7054,9 +7212,9 @@ - func: sparse_dim(Tensor self) -> int variants: method dispatch: - CPU, CUDA: sparse_dim_strided SparseCPU, SparseCUDA, SparseMeta: sparse_dim_sparse - SparseCsrCPU, SparseCsrCUDA: sparse_dim_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: sparse_dim_sparse_csr + CompositeExplicitAutograd: sparse_dim_default device_check: NoCheck device_guard: False @@ -7071,9 +7229,9 @@ - func: dense_dim(Tensor self) -> int variants: method dispatch: - CPU, CUDA: dense_dim_strided SparseCPU, SparseCUDA, SparseMeta: dense_dim_sparse - SparseCsrCPU, SparseCsrCUDA: dense_dim_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: dense_dim_sparse_csr + CompositeExplicitAutograd: dense_dim_default device_check: NoCheck device_guard: False @@ -7089,7 +7247,7 @@ variants: method dispatch: SparseCPU, SparseCUDA, SparseMeta: _nnz_sparse - SparseCsrCPU, SparseCsrCUDA: _nnz_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: _nnz_sparse_csr device_check: NoCheck device_guard: False @@ -7152,7 +7310,7 @@ variants: method dispatch: SparseCPU, SparseCUDA, SparseMeta: values_sparse - SparseCsrCPU, SparseCsrCUDA: values_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: values_sparse_csr NestedTensorCPU, NestedTensorCUDA: values_nested CompositeExplicitAutograd: values_default device_check: NoCheck @@ -7161,7 +7319,7 @@ - func: crow_indices(Tensor(a) self) -> Tensor(a) variants: method dispatch: - SparseCsrCPU, SparseCsrCUDA: crow_indices_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: crow_indices_sparse_csr CompositeExplicitAutograd: crow_indices_default device_check: NoCheck device_guard: False @@ -7169,7 +7327,7 @@ - func: col_indices(Tensor(a) self) -> Tensor(a) variants: method dispatch: - SparseCsrCPU, SparseCsrCUDA: col_indices_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: col_indices_sparse_csr CompositeExplicitAutograd: col_indices_default device_check: NoCheck device_guard: False @@ -7177,7 +7335,7 @@ - func: ccol_indices(Tensor(a) self) -> Tensor(a) variants: method dispatch: - SparseCsrCPU, SparseCsrCUDA: ccol_indices_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: ccol_indices_sparse_csr CompositeExplicitAutograd: ccol_indices_default device_check: NoCheck device_guard: False @@ -7185,7 +7343,7 @@ - func: row_indices(Tensor(a) self) -> Tensor(a) variants: method dispatch: - SparseCsrCPU, SparseCsrCUDA: row_indices_sparse_csr + SparseCsrCPU, SparseCsrCUDA, SparseCsrMeta: row_indices_sparse_csr CompositeExplicitAutograd: row_indices_default device_check: NoCheck device_guard: False @@ -7204,7 +7362,7 @@ device_check: NoCheck # Allows copy into different device variants: function dispatch: - SparseCPU, SparseCUDA: copy_sparse_ + SparseCPU, SparseCUDA, SparseMeta: copy_sparse_ autogen: copy_sparse_to_sparse, copy_sparse_to_sparse.out # By adding the AutogradNestedTensor this makes this function CompositeImplicit-like for nested tensors @@ -7307,7 +7465,7 @@ MkldnnCPU: mkldnn_reorder_conv2d_weight autogen: mkldnn_reorder_conv2d_weight.out -- func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1) -> Tensor +- func: mkldnn_reorder_conv3d_weight(Tensor self, SymInt[3] padding=0, SymInt[3] stride=1, SymInt[3] dilation=1, SymInt groups=1, SymInt[]? 
input_size=None) -> Tensor variants: function python_module: nn dispatch: @@ -7694,6 +7852,7 @@ dispatch: CPU, CUDA, Meta, MPS: set_ autogen: set.source_Storage, set.source_Storage_out + tags: inplace_view - func: set_.source_Storage_storage_offset(Tensor(a!) self, Storage source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!) variants: method @@ -7706,6 +7865,7 @@ MPS: set_storage_mps_ QuantizedCPU, QuantizedCUDA: set_storage_quantized_ autogen: set.source_Storage_storage_offset, set.source_Storage_storage_offset_out + tags: inplace_view - func: set_.source_Tensor_storage_offset(Tensor(a!) self, Tensor source, SymInt storage_offset, SymInt[] size, SymInt[] stride=[]) -> Tensor(a!) variants: method @@ -7713,6 +7873,7 @@ device_guard: False dispatch: CompositeImplicitAutograd: set__symint + tags: inplace_view - func: set_.source_Tensor(Tensor(a!) self, Tensor source) -> Tensor(a!) variants: method @@ -7721,6 +7882,7 @@ dispatch: CPU, CUDA, Meta, MPS: set_tensor_ autogen: set.source_Tensor, set.source_Tensor_out + tags: inplace_view - func: set_(Tensor(a!) self) -> Tensor(a!) variants: method @@ -7730,6 +7892,7 @@ Meta: set_meta_ MPS: set_mps_ autogen: set, set.out + tags: inplace_view # Not making it CompositeImplicitAutograd because lift # should be a primitive w.r.t. functorch @@ -10125,18 +10288,21 @@ variants: method, function dispatch: CompositeExplicitAutograd: alias + NestedTensorCPU, NestedTensorCUDA: alias_nested tags: core - func: _amp_foreach_non_finite_check_and_unscale_(Tensor(a!)[] self, Tensor(b!) found_inf, Tensor inv_scale) -> () variants: function dispatch: CUDA: _amp_foreach_non_finite_check_and_unscale_cuda_ + CPU: _amp_foreach_non_finite_check_and_unscale_cpu_ autogen: _amp_foreach_non_finite_check_and_unscale, _amp_foreach_non_finite_check_and_unscale.out - func: _amp_update_scale_(Tensor(a!) self, Tensor(b!) growth_tracker, Tensor found_inf, float scale_growth_factor, float scale_backoff_factor, int growth_interval) -> Tensor(a!) variants: function dispatch: CUDA: _amp_update_scale_cuda_ + CPU: _amp_update_scale_cpu_ autogen: _amp_update_scale, _amp_update_scale.out #- func: _cat(Tensor[] tensors, int dim=0) -> Tensor @@ -12360,6 +12526,7 @@ dispatch: CPU: upsample_linear1d_out_cpu CUDA: upsample_linear1d_out_cuda + MPS: upsample_linear1d_out_mps - func: upsample_linear1d(Tensor self, SymInt[1] output_size, bool align_corners, float? scales=None) -> Tensor python_module: nn @@ -12371,6 +12538,7 @@ dispatch: CPU: upsample_linear1d_backward_out_cpu CUDA: upsample_linear1d_backward_out_cuda + MPS: upsample_linear1d_backward_out_mps - func: upsample_linear1d_backward(Tensor grad_output, SymInt[1] output_size, SymInt[3] input_size, bool align_corners, float? scales=None) -> Tensor python_module: nn @@ -12843,7 +13011,7 @@ SparseMeta: isinf_sparse_meta SparseCsrCPU, SparseCsrCUDA: isinf_sparse_csr autogen: isinf.out - tags: core + tags: [core, pointwise] - func: record_stream(Tensor(a!) self, Stream s) -> () variants: method @@ -13769,11 +13937,18 @@ dispatch: CPU, CUDA: linalg_eig_out +- func: _linalg_eigvals(Tensor self) -> Tensor + python_module: linalg + dispatch: + CPU, CUDA: _linalg_eigvals + - func: linalg_eigvals(Tensor self) -> Tensor python_module: linalg - func: linalg_eigvals.out(Tensor self, *, Tensor(a!) out) -> Tensor(a!) 
python_module: linalg + dispatch: + CPU, CUDA: linalg_eigvals_out # This function is exposes the `compute_v` flag, which is then used to implement `linalg.eigh` and # `linalg.eigvalsh` as composite functions that call this one @@ -14077,6 +14252,12 @@ # It is undocumented and should not be used outside of tests. - func: _test_serialization_subcmul(Tensor self, Tensor other, Scalar alpha=1) -> Tensor +# Note: for testing COW materialization within `at::parallel_for` loop function +- func: _test_parallel_materialize(Tensor self, int num_parallel, bool skip_first=False) -> Tensor + variants: function + dispatch: + CompositeExplicitAutograd: _test_parallel_materialize + # Note: this function is only for testing. - func: _test_optional_intlist(Tensor values, int[]? addends) -> Tensor python_module: nn @@ -14411,6 +14592,7 @@ variants: function dispatch: CompositeExplicitAutograd: split_with_sizes_copy_out + CUDA: split_with_sizes_copy_out_cuda - func: view_copy(Tensor self, SymInt[] size) -> Tensor variants: function @@ -14521,6 +14703,16 @@ CUDA: _scaled_dot_product_efficient_attention_backward_cuda tags: nondeterministic_seeded +- func: _scaled_dot_product_cudnn_attention(Tensor query, Tensor key, Tensor value, float dropout_p=0.0, bool is_causal=False, bool return_debug_mask=False, *, float? scale=None) -> (Tensor output, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask) + dispatch: + CUDA: _scaled_dot_product_cudnn_attention_cuda + tags: nondeterministic_seeded + +- func: _scaled_dot_product_cudnn_attention_backward(Tensor grad_out, Tensor query, Tensor key, Tensor value, Tensor out, Tensor logsumexp, Tensor cum_seq_q, Tensor cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, Tensor philox_seed, Tensor philox_offset, *, float? scale=None) -> (Tensor, Tensor, Tensor) + dispatch: + CUDA: _scaled_dot_product_cudnn_attention_backward_cuda + tags: nondeterministic_seeded + - func: _flash_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? cum_seq_q, Tensor? cum_seq_k, SymInt max_q, SymInt max_k, float dropout_p, bool is_causal, bool return_debug_mask, *, float? scale=None) -> (Tensor output, Tensor softmax_logsumexp, Tensor philox_seed, Tensor philox_offset, Tensor debug_attn_mask) variants: function dispatch: @@ -14533,14 +14725,14 @@ dispatch: CUDA: _flash_attention_backward -# Returns ouput, logsumexp if compute_logsumexp -- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, int? max_seqlen_q, int? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k) +# Returns output, logsumexp if compute_logsumexp +- func: _efficient_attention_forward(Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt? max_seqlen_q, SymInt? max_seqlen_k, float dropout_p, int custom_mask_type, bool compute_log_sumexp=False, *, float? scale=None, Tensor? causal_diagonal=None, Tensor? seqlen_k=None, int? 
window_size=None) -> (Tensor output, Tensor logsumexp, Tensor philox_seed, Tensor philox_offset, SymInt max_seqlen_batch_q, SymInt max_seqlen_batch_k) variants: function dispatch: CUDA: _efficient_attention_forward tags: nondeterministic_seeded -- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None) -> (Tensor, Tensor, Tensor, Tensor) +- func: _efficient_attention_backward(Tensor grad_out_, Tensor query, Tensor key, Tensor value, Tensor? bias, Tensor out, Tensor? cu_seqlens_q, Tensor? cu_seqlens_k, SymInt max_seqlen_q, SymInt max_seqlen_k, Tensor logsumexp, float dropout_p, Tensor philox_seed, Tensor philox_offset, int custom_mask_type, bool bias_requires_grad, *, float? scale=None, int? num_splits_key=None, int? window_size=None) -> (Tensor, Tensor, Tensor, Tensor) device_check: NoCheck variants: function dispatch: @@ -15345,6 +15537,7 @@ # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). variants: function dispatch: + CPU: _fused_adam_kernel_cpu_ CUDA: _fused_adam_kernel_cuda_ autogen: _fused_adam, _fused_adam.out @@ -15354,6 +15547,7 @@ device_check: NoCheck variants: function dispatch: + CPU: _fused_adam_kernel_cpu_ CUDA: _fused_adam_kernel_cuda_ autogen: _fused_adam.tensor_lr, _fused_adam.tensor_lr_out @@ -15361,6 +15555,7 @@ # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). variants: function dispatch: + CPU: _fused_adamw_kernel_cpu_ CUDA: _fused_adamw_kernel_cuda_ autogen: _fused_adamw, _fused_adamw.out @@ -15370,9 +15565,28 @@ device_check: NoCheck variants: function dispatch: + CPU: _fused_adamw_kernel_cpu_ CUDA: _fused_adamw_kernel_cuda_ autogen: _fused_adamw.tensor_lr, _fused_adamw.tensor_lr_out +- func: _fused_sgd_(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, float lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> () + # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). + variants: function + dispatch: + CPU: _fused_sgd_kernel_cpu_ + CUDA: _fused_sgd_kernel_cuda_ + autogen: _fused_sgd, _fused_sgd.out + +- func: _fused_sgd_.tensor_lr(Tensor(a!)[] self, Tensor(b!)[] grads, Tensor(c!)[] momentum_buffer_list, *, float weight_decay, float momentum, Tensor lr, float dampening, bool nesterov, bool maximize, bool is_first_step, Tensor? grad_scale=None, Tensor? found_inf=None) -> () + # Unlike "foreach" functions, lists of tensors should be guaranteed to be on the same device (for now). + # but still skip the device check as the Tensor LR can be on CPU + device_check: NoCheck + variants: function + dispatch: + CPU: _fused_sgd_kernel_cpu_ + CUDA: _fused_sgd_kernel_cuda_ + autogen: _fused_sgd.tensor_lr, _fused_sgd.tensor_lr_out + # This op is ONLY used by pytorch/XLA in functionalization, and should never show up in vanilla eager mode or in any pytorch tracing contexts. 
- func: _propagate_xla_data(Tensor input, Tensor output) -> () variants: function diff --git a/aten/src/ATen/native/nested/NestedTensorBackward.cpp b/aten/src/ATen/native/nested/NestedTensorBackward.cpp index ef992a37c8688..e4465b792c21e 100644 --- a/aten/src/ATen/native/nested/NestedTensorBackward.cpp +++ b/aten/src/ATen/native/nested/NestedTensorBackward.cpp @@ -44,16 +44,16 @@ std::tuple nested_linear_backward( return std::tuple{Tensor(), Tensor(), Tensor()}; } Tensor grad_input, grad_weight, grad_bias; - auto grad_ouput_contiguous = grad_output.contiguous(); - auto* nt_grad_output = get_nested_tensor_impl(grad_ouput_contiguous); + auto grad_output_contiguous = grad_output.contiguous(); + auto* nt_grad_output = get_nested_tensor_impl(grad_output_contiguous); auto* nt_input = get_nested_tensor_impl(input); TORCH_INTERNAL_ASSERT(nt_grad_output != nullptr); TORCH_INTERNAL_ASSERT(nt_input != nullptr); TORCH_INTERNAL_ASSERT(nested_tensor_impl_is_contiguous(nt_grad_output)); - auto grad_ouput_buffer = nt_grad_output->get_buffer(); + auto grad_output_buffer = nt_grad_output->get_buffer(); auto input_buffer = nt_input->get_buffer(); - auto reshaped_grad = grad_ouput_buffer.reshape({-1, weight.size(0)}); + auto reshaped_grad = grad_output_buffer.reshape({-1, weight.size(0)}); if (output_mask[0]) { auto grad_input_buffer = at::mm(reshaped_grad, weight).view({-1}); @@ -137,7 +137,7 @@ Tensor _nested_sum_backward_cpu( AT_DISPATCH_ALL_TYPES_AND2( ScalarType::Half, ScalarType::BFloat16, self_grad_buffer.scalar_type(), "nested_sum_dim_cpu", [&]() { auto* self_grad_data = self_grad_buffer.data_ptr(); - const auto* output_grad_data = grad_buffer.data_ptr(); + const auto* output_grad_data = grad_buffer.const_data_ptr(); int64_t out_idx = 0, in_idx = 0; for (const auto i : c10::irange(ntensors)) { int64_t segments = num_segments[i].item(); diff --git a/aten/src/ATen/native/nested/NestedTensorFactories.cpp b/aten/src/ATen/native/nested/NestedTensorFactories.cpp index eaf3a1c88834c..45425ed63315c 100644 --- a/aten/src/ATen/native/nested/NestedTensorFactories.cpp +++ b/aten/src/ATen/native/nested/NestedTensorFactories.cpp @@ -230,5 +230,20 @@ Tensor narrow_nested_symint(const at::Tensor& self, int64_t dim, SymInt start, S storage_offsets); } +Tensor alias_nested(const Tensor& self) { + auto* nt_impl = get_nested_tensor_impl(self); + const at::Tensor& buffer = nt_impl->get_unsafe_storage_as_tensor(); + const auto& nested_sizes = nt_impl->get_nested_sizes(); + const auto& nested_strides = nt_impl->get_nested_strides(); + const auto& storage_offsets = nt_impl->get_storage_offsets(); + return at::detail::make_tensor( + c10::TensorImpl::VIEW, + std::move(buffer), + std::move(nested_sizes), + std::move(nested_strides), + std::move(storage_offsets)); +} + + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/nested/NestedTensorMath.cpp b/aten/src/ATen/native/nested/NestedTensorMath.cpp index c4bc824fdb3cf..7d3e826ef53e9 100644 --- a/aten/src/ATen/native/nested/NestedTensorMath.cpp +++ b/aten/src/ATen/native/nested/NestedTensorMath.cpp @@ -403,7 +403,7 @@ Tensor NestedTensor_sum_dim_CPU( AT_DISPATCH_ALL_TYPES_AND2( ScalarType::Half, ScalarType::BFloat16, buffer.scalar_type(), "nested_sum_dim_cpu", [&]() { auto* output_data = output_buffer.data_ptr(); - const auto* input_data = buffer.data_ptr(); + const auto* input_data = buffer.const_data_ptr(); int64_t out_idx = 0, in_idx = 0; for (const auto i : c10::irange(ntensors)) { int64_t segments = num_segments[i].item(); @@ -680,7 +680,7 @@ inline 
std::tuple NestedTensor_compute_size_stride( std::vector size_reshaped_vector(proposed_shape.begin() + 1, proposed_shape.end()); // only allow one pre-existing dimension to have proposed shape == -1 int64_t infer_index_old = -1; - // some negative sizes remain to be infered + // some negative sizes remain to be inferred if (ndims_underlying < ndims_underlying_reshaped) { int64_t numel = 1, numel_reshaped = 1; // replace negative sizes for old dimensions with old sizes @@ -770,7 +770,7 @@ inline std::tuple NestedTensor_compute_size_stride( } // namespace // Note [Special size rule for nested tensor] -// Instead of infering size, -1 means "inherit the old size", so: +// Instead of inferring size, -1 means "inherit the old size", so: // * negative size is legal for a ragged dimension // * however, we only allow one -1 // In principle we could still infer a dimension, @@ -861,6 +861,12 @@ Tensor _nested_view_from_buffer( storage_offsets); } +std::tuple _nested_compute_contiguous_strides_offsets(const Tensor& nested_size) { + return std::make_tuple( + construct_nested_strides(nested_size), + construct_offsets(nested_size)); +} + // See Note [Special size rule for nested tensor] Tensor reshape_nested(const Tensor& self, IntArrayRef proposed_shape) { TORCH_CHECK( @@ -894,7 +900,26 @@ Tensor reshape_nested(const Tensor& self, IntArrayRef proposed_shape) { } } +Tensor reshape_nested_symint(const Tensor& self, SymIntArrayRef proposed_shape) { + // Jagged layout NT decomp + if (self.layout() == at::kJagged) { + // TODO: Expand decomp to handle other viewable cases + bool viewable = self.is_contiguous(); + return ( + viewable ? self.view_symint(proposed_shape) : + self.clone(at::MemoryFormat::Contiguous).view_symint(proposed_shape) + ); + } + + return reshape_nested(self, C10_AS_INTARRAYREF_SLOW(proposed_shape)); +} + Tensor reshape_as_nested(const Tensor& self, const Tensor& other) { + // Jagged layout NT decomp + if (self.layout() == at::kJagged) { + return self.reshape_symint(other.sym_sizes()); + } + auto other_ptr = get_nested_tensor_impl(other); // TODO: this is to reproduce other_ptr->opt_sizes_ // if an accessor is provided in the future, can replace this diff --git a/aten/src/ATen/native/nested/NestedTensorMath.h b/aten/src/ATen/native/nested/NestedTensorMath.h index 8269985990966..068cc6b51ee70 100644 --- a/aten/src/ATen/native/nested/NestedTensorMath.h +++ b/aten/src/ATen/native/nested/NestedTensorMath.h @@ -75,5 +75,7 @@ C10_ALWAYS_INLINE std::pair _check_nested_layer_norm_inputs( return std::make_pair(M, N); } +Tensor reshape_nested(const Tensor& self, IntArrayRef proposed_shape); + } // namespace native } // namespace at diff --git a/aten/src/ATen/native/nested/NestedTensorMatmul.cpp b/aten/src/ATen/native/nested/NestedTensorMatmul.cpp index 8b937f16b0cf3..88e2a94570185 100644 --- a/aten/src/ATen/native/nested/NestedTensorMatmul.cpp +++ b/aten/src/ATen/native/nested/NestedTensorMatmul.cpp @@ -1,4 +1,5 @@ #include +#include #include #include @@ -20,38 +21,30 @@ namespace at { namespace native { Tensor bmm_nested(const Tensor& self, const Tensor& mat2) { - if (self.is_nested() && !mat2.is_nested()) { - AT_ERROR("Expected both to be nested, but got a nested self and non-nested other"); - } - else if (!self.is_nested() && mat2.is_nested()) { - AT_ERROR("Expected both to be nested, but got a non-nested self and nested other"); - } - // dispatcher should have guaranteed that at least one is nested - auto self_ptr = get_nested_tensor_impl(self); - auto mat2_ptr = get_nested_tensor_impl(mat2); - 
TORCH_CHECK(self_ptr->dim() == 3, "batch1 must be a 3D tensor"); - TORCH_CHECK(mat2_ptr->dim() == 3, "batch2 must be a 3D tensor"); - int64_t ntensors = self_ptr->size(0), - ntensors2 = mat2_ptr->size(0); + TORCH_CHECK(self.dim() == 3, "batch1 must be a 3D tensor"); + TORCH_CHECK(mat2.dim() == 3, "batch2 must be a 3D tensor"); + + int64_t ntensors = self.is_nested() ? get_nested_tensor_impl(self)->size(0) : self.size(0); + int64_t ntensors2 = mat2.is_nested() ? get_nested_tensor_impl(mat2)->size(0) : mat2.size(0); + TORCH_CHECK(ntensors == ntensors2, "Expected size for the 1st dimension of batch2 tensor to be: ", ntensors, " but got: ", ntensors2, "."); - const Tensor& self_buffer = self_ptr->get_unsafe_storage_as_tensor(), - & mat2_buffer = mat2_ptr->get_unsafe_storage_as_tensor(); - std::vector self_sizes = NestedTensor_get_sizes(self_ptr), - mat2_sizes = NestedTensor_get_sizes(mat2_ptr), - self_strides = NestedTensor_get_strides(self_ptr), - mat2_strides = NestedTensor_get_strides(mat2_ptr); - int64_t *self_offsets_ptr = self_ptr->get_storage_offsets().data_ptr(); - int64_t *mat2_offsets_ptr = mat2_ptr->get_storage_offsets().data_ptr(); + + const Tensor& self_buffer = self.is_nested() ? get_nested_tensor_impl(self)->get_unsafe_storage_as_tensor() : self; + const Tensor& mat2_buffer = mat2.is_nested() ? get_nested_tensor_impl(mat2)->get_unsafe_storage_as_tensor() : mat2; + + // create a contiguous output int64_t out_numel = 0; - const Tensor& self_sizemat = self_ptr->get_nested_sizes(); + const Tensor& self_sizemat = self.is_nested() ? + get_nested_tensor_impl(self)->get_nested_sizes() : get_nested_tensor_impl(mat2)->get_nested_sizes(); + Tensor out_sizemat = self_sizemat.new_empty(self_sizemat.sizes()); int64_t* out_sizemat_ptr = out_sizemat.data_ptr(); for (int64_t i = 0; i < ntensors; i++) { - const IntArrayRef& self_shape = self_sizes[i], - & mat2_shape = mat2_sizes[i]; + const IntArrayRef& self_shape = get_size_for_index(self, i); + const IntArrayRef& mat2_shape = get_size_for_index(mat2, i); const int64_t& self_size0 = self_shape[0], & self_size1 = self_shape[1], & mat2_size0 = mat2_shape[0], & mat2_size1 = mat2_shape[1]; TORCH_CHECK(self_size1 == mat2_size0, @@ -63,7 +56,7 @@ Tensor bmm_nested(const Tensor& self, const Tensor& mat2) { out_sizemat_ptr += 2; out_numel += self_size0 * mat2_size1; } - Tensor out_buffer = self_buffer.new_empty(out_numel); + Tensor out_buffer = self.is_nested() ? 
self_buffer.new_empty(out_numel) : mat2_buffer.new_empty(out_numel); Tensor output = wrap_buffer(out_buffer, out_sizemat); // call tensor mm // TODO: `padding nested tensor -> bmm -> remove padding` may be more efficient @@ -73,12 +66,14 @@ Tensor bmm_nested(const Tensor& self, const Tensor& mat2) { std::vector output_unbind = output.unbind(); for (int64_t i = 0; i < ntensors; i++) { at::mm_out(output_unbind[i], - self_buffer.as_strided(self_sizes[i], self_strides[i], self_offsets_ptr[i]), - mat2_buffer.as_strided(mat2_sizes[i], mat2_strides[i], mat2_offsets_ptr[i])); + self_buffer.as_strided(get_size_for_index(self, i), get_stride_for_index(self, i), get_offset_for_index(self, i)), + mat2_buffer.as_strided(get_size_for_index(mat2, i), get_stride_for_index(mat2, i), get_offset_for_index(mat2, i))); } return output; } + + static Tensor matmul_with_bmm_nested(const Tensor& self, const Tensor& mat2) { // Tensor self = self_.contiguous(); // Tensor mat2 = mat2_.contiguous(); @@ -89,13 +84,15 @@ static Tensor matmul_with_bmm_nested(const Tensor& self, const Tensor& mat2) { // metadata for self std::vector self_sizes = NestedTensor_get_sizes(self_ptr); std::vector self_strides = NestedTensor_get_strides(self_ptr); - int64_t *self_offsets_ptr = self_ptr->get_storage_offsets().data_ptr(); + int64_t* self_offsets_ptr = + self_ptr->get_storage_offsets().data_ptr(); auto opt = self_ptr->get_nested_sizes().options(); // metadata for mat2 std::vector mat2_sizes = NestedTensor_get_sizes(mat2_ptr); std::vector mat2_strides = NestedTensor_get_strides(mat2_ptr); - int64_t *mat2_offsets_ptr = mat2_ptr->get_storage_offsets().data_ptr(); + int64_t* mat2_offsets_ptr = + mat2_ptr->get_storage_offsets().data_ptr(); auto opt2 = mat2_ptr->get_nested_sizes().options(); int64_t N = self_sizes.size(); @@ -108,7 +105,7 @@ static Tensor matmul_with_bmm_nested(const Tensor& self, const Tensor& mat2) { auto self_new_strides = at::empty({N * n_heads, 2}, opt); int64_t* self_new_strides_ptr = self_new_strides.mutable_data_ptr(); auto self_new_offsets = at::empty({N * n_heads}, opt); - int64_t *self_new_offsets_ptr = self_new_offsets.mutable_data_ptr(); + int64_t* self_new_offsets_ptr = self_new_offsets.mutable_data_ptr(); // viewed metadata for mat2 auto mat2_new_sizes = at::empty({N * n_heads, 2}, opt2); @@ -117,7 +114,7 @@ static Tensor matmul_with_bmm_nested(const Tensor& self, const Tensor& mat2) { auto mat2_new_strides = at::empty({N * n_heads, 2}, opt2); int64_t* mat2_new_strides_ptr = mat2_new_strides.mutable_data_ptr(); auto mat2_new_offsets = at::empty({N * n_heads}, opt); - int64_t *mat2_new_offsets_ptr = mat2_new_offsets.mutable_data_ptr(); + int64_t* mat2_new_offsets_ptr = mat2_new_offsets.mutable_data_ptr(); for (int64_t i = 0; i < N; i++) { const IntArrayRef& self_size_i = self_sizes[i]; @@ -146,7 +143,6 @@ static Tensor matmul_with_bmm_nested(const Tensor& self, const Tensor& mat2) { } } - // view self as [N * n_heads, *, head_dim] (collapse first 2 dims) auto viewed_self = create_nested_view_tensor( self, self_new_sizes, self_new_strides, self_new_offsets); @@ -163,7 +159,7 @@ static Tensor matmul_with_bmm_nested(const Tensor& self, const Tensor& mat2) { auto out_new_sizes = at::empty({N, 3}, opt); auto out_new_strides = at::empty({N, 3}, opt); auto out_new_offsets = at::empty({N}, opt); - int64_t *out_new_offsets_ptr = out_new_offsets.mutable_data_ptr(); + int64_t* out_new_offsets_ptr = out_new_offsets.mutable_data_ptr(); int64_t* out_new_sizes_ptr = out_new_sizes.data_ptr(); int64_t* out_new_strides_ptr = 
out_new_strides.data_ptr(); @@ -187,15 +183,16 @@ static Tensor matmul_with_bmm_nested(const Tensor& self, const Tensor& mat2) { bmm_output, out_new_sizes, out_new_strides, out_new_offsets); return viewed_out; - } // nt: NT of shape (B, *, C, D) // other: dense tensor of shape (D, E) // output: NT of shape (B, *, C, E) -static Tensor matmul_nested_with_broadcasted_dense(const Tensor& nt, const Tensor& other) { +static Tensor matmul_nested_with_broadcasted_dense( + const Tensor& nt, + const Tensor& other) { // View nt buffer as 3D jagged for matmul - auto *nt_impl = get_nested_tensor_impl(nt); + auto* nt_impl = get_nested_tensor_impl(nt); auto jagged = nt_impl->get_buffer().view({-1, nt.size(2), nt.size(3)}); auto new_buffer = at::matmul(jagged, other); @@ -222,22 +219,19 @@ static Tensor matmul_nested_with_broadcasted_dense(const Tensor& nt, const Tenso // TODO: Should make full matmul semantics support some day Tensor matmul_nested(const Tensor& self, const Tensor& mat2) { // special case of NT (B, *, C, D) with broadcasted dense (D, E) - if (self.is_nested() && - self.is_contiguous() && - !mat2.is_nested() && - self.dim() == 4 && - mat2.dim() == 2 && - get_nested_tensor_impl(self)->opt_size(2).has_value() && - get_nested_tensor_impl(self)->opt_size(3).has_value() && - self.size(3) == mat2.size(0) - ) { + if (self.is_nested() && self.is_contiguous() && !mat2.is_nested() && + self.dim() == 4 && mat2.dim() == 2 && + get_nested_tensor_impl(self)->opt_size(2).has_value() && + get_nested_tensor_impl(self)->opt_size(3).has_value() && + self.size(3) == mat2.size(0)) { return matmul_nested_with_broadcasted_dense(self, mat2); } if (self.is_nested() && !mat2.is_nested()) { - AT_ERROR("Expected both to be nested, but got a nested self and non-nested other"); - } - else if (!self.is_nested() && mat2.is_nested()) { - AT_ERROR("Expected both to be nested, but got a non-nested self and nested other"); + AT_ERROR( + "Expected both to be nested, but got a nested self and non-nested other"); + } else if (!self.is_nested() && mat2.is_nested()) { + AT_ERROR( + "Expected both to be nested, but got a non-nested self and nested other"); } // to_padded_tensor only supports contiguous inputs auto self_contig = self.contiguous(); @@ -245,8 +239,7 @@ Tensor matmul_nested(const Tensor& self, const Tensor& mat2) { // dispatcher should have guaranteed that at least one is nested const auto self_ptr = get_nested_tensor_impl(self_contig); const auto mat2_ptr = get_nested_tensor_impl(mat2_contig); - int64_t self_dim = self_ptr->dim(), - mat2_dim = mat2_ptr->dim(); + int64_t self_dim = self_ptr->dim(), mat2_dim = mat2_ptr->dim(); TORCH_CHECK( self_dim >= 3, "matmul: For nested tensors, only inputs with >= 3 dims are currently supported. 1st input has rank: ", @@ -255,41 +248,47 @@ Tensor matmul_nested(const Tensor& self, const Tensor& mat2) { mat2_dim >= 3, "matmul: For nested tensors, only inputs with >= 3 dims are currently supported. 
2nd input has rank: ", mat2_dim); - TORCH_CHECK(self_dim == mat2_dim, "matmul: both inputs must have the same rank"); - int64_t ntensors = self_ptr->size(0), - ntensors2 = mat2_ptr->size(0); - TORCH_CHECK(ntensors == ntensors2, - "matmul: Expected size for the 1st dimension of 2nd input tensor to be: ", ntensors, - " but got: ", ntensors2, "."); + TORCH_CHECK( + self_dim == mat2_dim, "matmul: both inputs must have the same rank"); + int64_t ntensors = self_ptr->size(0), ntensors2 = mat2_ptr->size(0); + TORCH_CHECK( + ntensors == ntensors2, + "matmul: Expected size for the 1st dimension of 2nd input tensor to be: ", + ntensors, + " but got: ", + ntensors2, + "."); // Ensure batch dimensions have the same sizes (no broadcasting). const auto& self_sizes = self_ptr->get_nested_sizes(); const auto& mat2_sizes = mat2_ptr->get_nested_sizes(); - const auto& self_batch_sizes = self_sizes.narrow(1, 0, self_dim-3); - const auto& mat2_batch_sizes = mat2_sizes.narrow(1, 0, mat2_dim-3); - TORCH_CHECK(at::equal(self_batch_sizes, mat2_batch_sizes), - "matmul: For nested tensors, batch dimensions must have the same sizes, ", - "no broadcasting is currently performed. Got batch shapes for self ", - self_batch_sizes, - " and batch shapes for mat2 ", - mat2_batch_sizes); + const auto& self_batch_sizes = self_sizes.narrow(1, 0, self_dim - 3); + const auto& mat2_batch_sizes = mat2_sizes.narrow(1, 0, mat2_dim - 3); + TORCH_CHECK( + at::equal(self_batch_sizes, mat2_batch_sizes), + "matmul: For nested tensors, batch dimensions must have the same sizes, ", + "no broadcasting is currently performed. Got batch shapes for self ", + self_batch_sizes, + " and batch shapes for mat2 ", + mat2_batch_sizes); // Ensure last dim of self and second last dim of mat2 have the same size const auto& self_dim_size = self_sizes.select(1, -1); const auto& mat2_dim_size = mat2_sizes.select(1, -2); - TORCH_CHECK(at::equal(self_dim_size, mat2_dim_size), - "matmul: Nested tensors cannot be matrix multiplied, last dimension of self has sizes", - self_dim_size, - "second last dimension of mat2 has sizes", - mat2_dim_size); - - // use bmm inference-only fast path for [N, n_heads, *, head_dim] [N, n_heads, head_dim, *] - if (self.is_cuda() && - self_dim == 4 && self.is_contiguous() && + TORCH_CHECK( + at::equal(self_dim_size, mat2_dim_size), + "matmul: Nested tensors cannot be matrix multiplied, last dimension of self has sizes", + self_dim_size, + "second last dimension of mat2 has sizes", + mat2_dim_size); + + // use bmm inference-only fast path for [N, n_heads, *, head_dim] [N, n_heads, + // head_dim, *] + if (self.is_cuda() && self_dim == 4 && self.is_contiguous() && mat2_dim == 4 && mat2.is_contiguous() && - !(GradMode::is_enabled() && (self.requires_grad() || mat2.requires_grad()))) { + !(GradMode::is_enabled() && + (self.requires_grad() || mat2.requires_grad()))) { const auto& self_opt_head_dim = self_ptr->opt_size(1); const auto& mat2_opt_head_dim = mat2_ptr->opt_size(1); - if (self_opt_head_dim.has_value() && - mat2_opt_head_dim.has_value() && + if (self_opt_head_dim.has_value() && mat2_opt_head_dim.has_value() && self_opt_head_dim.value() == mat2_opt_head_dim.value()) { return matmul_with_bmm_nested(self, mat2); } @@ -297,8 +296,10 @@ Tensor matmul_nested(const Tensor& self, const Tensor& mat2) { // Construct output size from input sizes Tensor output_sizes = self_sizes.clone(); - // The last entry in every row of output_sizes should be last column of mat2_sizes - output_sizes.index_put_({at::indexing::Slice(), -1}, 
mat2_sizes.select(1, -1).clone()); + // The last entry in every row of output_sizes should be last column of + // mat2_sizes + output_sizes.index_put_( + {at::indexing::Slice(), -1}, mat2_sizes.select(1, -1).clone()); auto self_padded = self_contig.to_padded_tensor(0.); auto mat2_padded = mat2_contig.to_padded_tensor(0.); @@ -307,7 +308,10 @@ Tensor matmul_nested(const Tensor& self, const Tensor& mat2) { return output_nested; } -Tensor& matmul_out_nested(const Tensor& tensor1, const Tensor& tensor2, Tensor& result) { +Tensor& matmul_out_nested( + const Tensor& tensor1, + const Tensor& tensor2, + Tensor& result) { // TODO: this is a very quick and dirty implementation // should improve it to avoid the intermediate memory usage Tensor function_result = at::matmul(tensor1, tensor2); @@ -319,8 +323,7 @@ Tensor& matmul_out_nested(const Tensor& tensor1, const Tensor& tensor2, Tensor& c10::optional opt_size = function_result_ptr->opt_size(i); if (opt_size.has_value()) { sizes.push_back(*opt_size); - } - else { + } else { sizes.push_back(-1); } } diff --git a/aten/src/ATen/native/nested/NestedTensorUtils.h b/aten/src/ATen/native/nested/NestedTensorUtils.h index 206899875156b..3b4f18f11b64b 100644 --- a/aten/src/ATen/native/nested/NestedTensorUtils.h +++ b/aten/src/ATen/native/nested/NestedTensorUtils.h @@ -119,7 +119,7 @@ inline std::vector NestedTensor_get_sizes( if (orig_dim == 0) { return sizes; } - const int64_t* sizemat_ptr = sizemat.data_ptr(); + const int64_t* sizemat_ptr = sizemat.const_data_ptr(); for (const auto i : c10::irange(ntensors)) { sizes[i] = IntArrayRef(sizemat_ptr, sizemat_ptr + orig_dim); @@ -152,7 +152,7 @@ inline std::vector NestedTensor_get_strides( if (orig_dim == 0) { return strides; } - const int64_t* stridemat_ptr = stridemat.data_ptr(); + const int64_t* stridemat_ptr = stridemat.const_data_ptr(); for (const auto i : c10::irange(ntensors)) { strides[i] = IntArrayRef(stridemat_ptr, stridemat_ptr + orig_dim); stridemat_ptr += orig_dim; @@ -178,6 +178,40 @@ inline void check_numel_equals_buffer_size(const NestedTensorImpl* self_ptr) { self_ptr->numel() == static_cast(self_ptr->get_buffer_size()), "Number of elements in nested tensor must match number of elements in buffer."); } + +// Helper function to get size / stride / offset for a nested/normal tensor. +inline IntArrayRef get_size_for_index(const Tensor& tensor, int i) { + if (tensor.is_nested()) { + std::vector tensor_sizes = + NestedTensor_get_sizes(get_nested_tensor_impl(tensor)); + return tensor_sizes[i]; + } else { + return tensor.sizes().slice(1); + } +} + +inline IntArrayRef get_stride_for_index(const Tensor& tensor, int i) { + if (tensor.is_nested()) { + std::vector tensor_strides = + NestedTensor_get_strides(get_nested_tensor_impl(tensor)); + return tensor_strides[i]; + } else { + return tensor.strides().slice(1); + } +} + +inline int64_t get_offset_for_index(const Tensor& tensor, int i) { + if (tensor.is_nested()) { + int64_t* offsets_ptr = get_nested_tensor_impl(tensor) + ->get_storage_offsets() + .data_ptr(); + return offsets_ptr[i]; + + } else { + int64_t offset = tensor.storage_offset(); + return offset + tensor.strides()[0] * i; + } +} // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // Data structures and functions for generically applying a function on a nested // tensor. 
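For reference, the dense branch of the get_size_for_index / get_stride_for_index / get_offset_for_index helpers added just above is simply the arithmetic an ordinary batched tensor needs for as_strided: drop the leading batch dimension from sizes/strides and advance the storage offset by strides()[0] * i. A minimal standalone sketch of that dense case (the slice_view name is made up for illustration and is not part of this patch):

#include <ATen/ATen.h>
#include <cstdio>

// Rebuild batch element i of a dense 3-D tensor the same way the dense
// branch of the new helpers feeds as_strided.
static at::Tensor slice_view(const at::Tensor& t, int64_t i) {
  return t.as_strided(
      t.sizes().slice(1),                        // drop the batch dim
      t.strides().slice(1),                      // per-matrix strides
      t.storage_offset() + t.strides()[0] * i);  // jump to batch i
}

int main() {
  at::Tensor b = at::rand({4, 3, 5});
  // slice_view(b, 2) should alias the same elements as b.select(0, 2).
  std::printf("matches select: %d\n",
              static_cast<int>(at::equal(slice_view(b, 2), b.select(0, 2))));
  return 0;
}

This is what lets the bmm paths elsewhere in this patch gather per-matrix sizes, strides, and offsets uniformly whether an operand is nested or a regular dense batch.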
@@ -193,7 +227,8 @@ struct NestedNode { // NestedNode(NestedNode&) = delete; // NestedNode(const NestedNode&) = delete; // NestedNode& operator=(NestedNode) = delete; - explicit NestedNode(T payload) : _is_leaf(true), _payload(std::move(payload)) {} + explicit NestedNode(T payload) + : _is_leaf(true), _payload(std::move(payload)) {} inline bool is_leaf() const { return _is_leaf; } @@ -367,7 +402,7 @@ inline Tensor wrap_tensor_node( if (tensor_node.children(i).numel() > 0) { memcpy( nt_buffer.mutable_data_ptr() + start_offsets[i], - tensor_node.children(i).data_ptr(), + tensor_node.children(i).const_data_ptr(), tensor_node.children(i).numel() * sizeof(scalar_t)); } } diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorBinaryOps.cu b/aten/src/ATen/native/nested/cuda/NestedTensorBinaryOps.cu index f7055d7fd0330..350c3a27e77b0 100644 --- a/aten/src/ATen/native/nested/cuda/NestedTensorBinaryOps.cu +++ b/aten/src/ATen/native/nested/cuda/NestedTensorBinaryOps.cu @@ -85,8 +85,8 @@ void _nested_op_dense_esuhm_kernel(Tensor& result, const Tensor& self, const Ten auto result_offsets = at::cat({offsets, at::tensor(self_ptr->numel())}); result_offsets = result_offsets.to(kCUDA); - const scalar_t* self_data_ptr = self_buffer.data_ptr(); - const scalar_t* other_data_ptr = other.data_ptr(); + const scalar_t* self_data_ptr = self_buffer.const_data_ptr(); + const scalar_t* other_data_ptr = other.const_data_ptr(); scalar_t* result_data_ptr = result_buffer.data_ptr(); int64_t* result_offsets_ptr = result_offsets.data_ptr(); diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorMatmul.cu b/aten/src/ATen/native/nested/cuda/NestedTensorMatmul.cu index f5b56b2a8c47e..252e3741c5c7d 100644 --- a/aten/src/ATen/native/nested/cuda/NestedTensorMatmul.cu +++ b/aten/src/ATen/native/nested/cuda/NestedTensorMatmul.cu @@ -283,16 +283,10 @@ bool group_gemm_dispatch( #endif Tensor bmm_nested_cuda(const Tensor& self, const Tensor& mat2) { - if (self.is_nested() && !mat2.is_nested()) { - AT_ERROR( - "Expected both to be nested, but got a nested self and non-nested other"); - } else if (!self.is_nested() && mat2.is_nested()) { - AT_ERROR( - "Expected both to be nested, but got a non-nested self and nested other"); - } + // dispatcher should have guaranteed that at least one is nested - auto self_ptr = get_nested_tensor_impl(self); - auto mat2_ptr = get_nested_tensor_impl(mat2); + auto self_ptr = self.is_nested() ? get_nested_tensor_impl(self) : self.unsafeGetTensorImpl(); + auto mat2_ptr = mat2.is_nested() ? get_nested_tensor_impl(mat2) : mat2.unsafeGetTensorImpl(); TORCH_CHECK(self_ptr->dim() == 3, "batch1 must be a 3D tensor"); TORCH_CHECK(mat2_ptr->dim() == 3, "batch2 must be a 3D tensor"); int64_t ntensors = self_ptr->size(0), ntensors2 = mat2_ptr->size(0); @@ -305,16 +299,15 @@ Tensor bmm_nested_cuda(const Tensor& self, const Tensor& mat2) { "."); // create a contiguous output - const Tensor& self_sizemat = self_ptr->get_nested_sizes(); + const Tensor& self_sizemat = self.is_nested() ? 
+ get_nested_tensor_impl(self)->get_nested_sizes() : get_nested_tensor_impl(mat2)->get_nested_sizes(); + Tensor out_sizemat = self_sizemat.new_empty(self_sizemat.sizes()); int64_t* out_sizemat_ptr = out_sizemat.data_ptr(); - std::vector self_sizes = NestedTensor_get_sizes(self_ptr); - std::vector mat2_sizes = NestedTensor_get_sizes(mat2_ptr); - int64_t out_numel = 0; for (int64_t i = 0; i < ntensors; i++) { - const IntArrayRef &self_shape = self_sizes[i], &mat2_shape = mat2_sizes[i]; + const IntArrayRef &self_shape = get_size_for_index(self, i), &mat2_shape = get_size_for_index(mat2, i); const int64_t &self_size0 = self_shape[0], &self_size1 = self_shape[1], &mat2_size0 = mat2_shape[0], &mat2_size1 = mat2_shape[1]; TORCH_CHECK( @@ -334,17 +327,15 @@ Tensor bmm_nested_cuda(const Tensor& self, const Tensor& mat2) { out_sizemat_ptr += 2; out_numel += self_size0 * mat2_size1; } - const Tensor &self_buffer = self_ptr->get_unsafe_storage_as_tensor(); - const Tensor &mat2_buffer = mat2_ptr->get_unsafe_storage_as_tensor(); + + const Tensor &self_buffer = self.is_nested() ? get_nested_tensor_impl(self)->get_unsafe_storage_as_tensor() : self; + const Tensor &mat2_buffer = mat2.is_nested() ? get_nested_tensor_impl(mat2)->get_unsafe_storage_as_tensor() : mat2; + Tensor out_buffer = self_buffer.new_empty(out_numel); Tensor output = wrap_buffer(out_buffer, out_sizemat); auto out_ptr = get_nested_tensor_impl(output); - std::vector self_strides = NestedTensor_get_strides(self_ptr); - std::vector mat2_strides = NestedTensor_get_strides(mat2_ptr); - const int64_t *self_offsets_ptr = self_ptr->get_storage_offsets().data_ptr(); - const int64_t *mat2_offsets_ptr = mat2_ptr->get_storage_offsets().data_ptr(); - const int64_t *out_offsets_ptr = out_ptr->get_storage_offsets().data_ptr(); + const int64_t *out_offsets_ptr = out_ptr->get_storage_offsets().const_data_ptr(); #ifndef USE_ROCM #ifndef _WIN32 @@ -360,21 +351,23 @@ Tensor bmm_nested_cuda(const Tensor& self, const Tensor& mat2) { std::vector gemm_sizes; bool all_row_major = true; for (int64_t i = 0; i < ntensors; i++) { - const IntArrayRef& self_shape = self_sizes[i]; - const IntArrayRef& mat2_shape = mat2_sizes[i]; + const IntArrayRef& self_shape = get_size_for_index(self, i); + const IntArrayRef& mat2_shape = get_size_for_index(mat2, i); const int64_t &self_size0 = self_shape[0]; const int64_t &self_size1 = self_shape[1]; const int64_t &mat2_size0 = mat2_shape[0]; const int64_t &mat2_size1 = mat2_shape[1]; gemm_sizes.push_back( cutlass::gemm::GemmCoord(self_size0, mat2_size1, self_size1)); - aptr[i] = self_buffer.data_ptr() + self_offsets_ptr[i]; - bptr[i] = mat2_buffer.data_ptr() + mat2_offsets_ptr[i]; + aptr[i] = self_buffer.data_ptr() + get_offset_for_index(self, i); + bptr[i] = mat2_buffer.data_ptr() + get_offset_for_index(mat2, i); dptr[i] = out_buffer.data_ptr() + out_offsets_ptr[i]; - all_row_major = all_row_major && (self_strides[i][1] == 1); - all_row_major = all_row_major && (mat2_strides[i][1] == 1); - lda[i] = self_strides[i][0]; - ldb[i] = mat2_strides[i][0]; + auto self_stride = get_stride_for_index(self, i); + auto mat2_stride = get_stride_for_index(mat2, i); + all_row_major = all_row_major && (self_stride[1] == 1); + all_row_major = all_row_major && (mat2_stride[1] == 1); + lda[i] = self_stride[0]; + ldb[i] = mat2_stride[0]; ldd[i] = mat2_size1; } auto dprops = at::cuda::getCurrentDeviceProperties(); @@ -403,11 +396,9 @@ Tensor bmm_nested_cuda(const Tensor& self, const Tensor& mat2) { std::vector output_unbind = output.unbind(); for 
(int64_t i = 0; i < ntensors; i++) { - at::mm_out( - output_unbind[i], - self_buffer.as_strided(self_sizes[i], self_strides[i], self_offsets_ptr[i]), - mat2_buffer.as_strided( - mat2_sizes[i], mat2_strides[i], mat2_offsets_ptr[i])); + at::mm_out(output_unbind[i], + self_buffer.as_strided(get_size_for_index(self, i), get_stride_for_index(self, i), get_offset_for_index(self, i)), + mat2_buffer.as_strided(get_size_for_index(mat2, i), get_stride_for_index(mat2, i), get_offset_for_index(mat2, i))); } return output; } diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp index 8955585b432e8..0da0c3e361d1f 100644 --- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp +++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerFunctions.cpp @@ -28,7 +28,7 @@ namespace { int64_t padded_tensor_numel(const Tensor& sizes) { const auto sizes_num_rows = sizes.sizes()[0]; const auto sizes_row_length = sizes.sizes()[1]; - const auto* sizes_data = sizes.data_ptr(); + const auto* sizes_data = sizes.const_data_ptr(); int64_t numel = 0; for (const auto row_num : c10::irange(sizes_num_rows)) { const auto* row_ptr = sizes_data + row_num * sizes_row_length; diff --git a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerUtils.cpp b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerUtils.cpp index a209c766c24ec..f708920d04dfa 100644 --- a/aten/src/ATen/native/nested/cuda/NestedTensorTransformerUtils.cpp +++ b/aten/src/ATen/native/nested/cuda/NestedTensorTransformerUtils.cpp @@ -78,8 +78,8 @@ int64_t get_nnz(Tensor nestedtensor) { * use with the flash-attention and efficient_attention kernels without * needing to call contiguous on the nested tensor input. * It checks that the storage offsets' adjacent_differences are a constant - * mutiple of the previous tensor in the nested tensor and that the strides - * are monitonically decreasing. This check is done after calling transpose on + * multiple of the previous tensor in the nested tensor and that the strides + * are monotonically decreasing. This check is done after calling transpose on * the nested tensor. 
Resulting in a Nt of shape [bsz, {seq_len}, num_heads, dim] * * @return A boolean indicating of contiguous needs to be called for input @@ -133,8 +133,8 @@ int64_t get_nnz(Tensor nestedtensor) { } // Check the offsets are a constant multiple from the previous numels - const int64_t* tensor_size_ptr = tensor_sizes.data_ptr(); - const int64_t* tensor_stride_ptr = tensor_strides.data_ptr(); + const int64_t* tensor_size_ptr = tensor_sizes.const_data_ptr(); + const int64_t* tensor_stride_ptr = tensor_strides.const_data_ptr(); int64_t numel_0 = (tensor_size_ptr[0] * tensor_stride_ptr[0]); TORCH_INTERNAL_ASSERT(numel_0 > 0, "numels must be positive!"); diff --git a/aten/src/ATen/native/quantized/ConvUtils.h b/aten/src/ATen/native/quantized/ConvUtils.h index 092f68e7d5b63..6f8ff918c1d2f 100644 --- a/aten/src/ATen/native/quantized/ConvUtils.h +++ b/aten/src/ATen/native/quantized/ConvUtils.h @@ -5,7 +5,7 @@ namespace at::native::quantized { namespace { // MakeConvOutputShape used from both CPU and CUDA libraries -// and exporting symbol from torch_cpu would probaby take more storage +// and exporting symbol from torch_cpu would probably take more storage // than duplicating implementation which likely be inlined away template at::SmallVector MakeConvOutputShape( diff --git a/aten/src/ATen/native/quantized/FakeQuantPerChannelAffine.cpp b/aten/src/ATen/native/quantized/FakeQuantPerChannelAffine.cpp index 91db5a7ae313a..25dbc0926ccf0 100644 --- a/aten/src/ATen/native/quantized/FakeQuantPerChannelAffine.cpp +++ b/aten/src/ATen/native/quantized/FakeQuantPerChannelAffine.cpp @@ -48,6 +48,8 @@ std::tuple fake_quantize_per_channel_affine_cachemask( int64_t axis, int64_t quant_min, int64_t quant_max) { + TORCH_CHECK(scale.scalar_type() == ScalarType::Float, + "Scale must be Float, found ", scale.scalar_type()); TORCH_CHECK(zero_point.scalar_type() == ScalarType::Int || zero_point.scalar_type() == ScalarType::Float || zero_point.scalar_type() == ScalarType::Half, "Zero-point must be Int32, Float or Half, found ", zero_point.scalar_type()); TORCH_CHECK(scale.dim() == 1, "scale should be a 1-D tensor"); diff --git a/aten/src/ATen/native/quantized/QTensor.cpp b/aten/src/ATen/native/quantized/QTensor.cpp index b8841214fdcb2..9705de0a4a54d 100644 --- a/aten/src/ATen/native/quantized/QTensor.cpp +++ b/aten/src/ATen/native/quantized/QTensor.cpp @@ -344,7 +344,7 @@ std::tuple choose_qparams_optimized( TORCH_CHECK(numel <= input_tensor.numel(), "numel ", numel, " greater than input_tensor.numel() ", input_tensor.numel()); - const float* input_row = input_tensor.data_ptr(); + const float* input_row = input_tensor.const_data_ptr(); float xmin = *std::min_element(input_row, input_row + numel); float xmax = *std::max_element(input_row, input_row + numel); @@ -352,7 +352,7 @@ std::tuple choose_qparams_optimized( // NOLINTNEXTLINE(cppcoreguidelines-narrowing-conversions,bugprone-narrowing-conversions) int min_bins = n_bins * (1.0 - (float) ratio); Tensor input_tensor_contig = input_tensor.contiguous(); - const float* input = input_tensor_contig.data_ptr(); + const float* input = input_tensor_contig.const_data_ptr(); std::vector q_input(numel); float loss = diff --git a/aten/src/ATen/native/quantized/TensorCompare.cpp b/aten/src/ATen/native/quantized/TensorCompare.cpp index 25cf5c6c93a35..def1622863e1d 100644 --- a/aten/src/ATen/native/quantized/TensorCompare.cpp +++ b/aten/src/ATen/native/quantized/TensorCompare.cpp @@ -50,9 +50,7 @@ std::tuple sort_quantized_cpu_stable( c10::optional stable, int64_t dim, bool descending) { 
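As an aside on the TensorCompare.cpp hunk that continues just below (it only swaps std::tie for a structured binding), the user-visible behavior is that sorting a per-tensor quantized tensor sorts its integer representation and rewraps the result with the original quantization parameters. A small sketch of that, assuming the quantized CPU sort kernel shown here is what the dispatcher ends up calling:

#include <ATen/ATen.h>
#include <cstdio>

int main() {
  at::Tensor x = at::rand({6});
  at::Tensor q = at::quantize_per_tensor(x, /*scale=*/0.1, /*zero_point=*/0, at::kQUInt8);
  // values is again quantized, with the same scale / zero_point as q.
  auto [values, indices] = at::sort(q, /*dim=*/-1, /*descending=*/false);
  std::printf("scale=%.3f zero_point=%lld\n",
              values.q_scale(),
              static_cast<long long>(values.q_zero_point()));
  return 0;
}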
- Tensor sort_int; - Tensor sort_indicies; - std::tie(sort_int, sort_indicies) = + auto [sort_int, sort_indicies] = at::sort(self.int_repr(), stable, dim, descending); return std::forward_as_tuple( at::_make_per_tensor_quantized_tensor( diff --git a/aten/src/ATen/native/quantized/cpu/AdaptiveAveragePooling.cpp b/aten/src/ATen/native/quantized/cpu/AdaptiveAveragePooling.cpp index 1317817902cfb..74476e0a80ae0 100644 --- a/aten/src/ATen/native/quantized/cpu/AdaptiveAveragePooling.cpp +++ b/aten/src/ATen/native/quantized/cpu/AdaptiveAveragePooling.cpp @@ -16,7 +16,6 @@ #endif #include -#include #include #include diff --git a/aten/src/ATen/native/quantized/cpu/AveragePool2d.cpp b/aten/src/ATen/native/quantized/cpu/AveragePool2d.cpp index bb72a2010ca3b..754c7d6bd529b 100644 --- a/aten/src/ATen/native/quantized/cpu/AveragePool2d.cpp +++ b/aten/src/ATen/native/quantized/cpu/AveragePool2d.cpp @@ -18,7 +18,6 @@ #endif #include -#include #include #include @@ -188,10 +187,9 @@ Tensor q_avg_pool2d( bool count_include_pad, c10::optional divisor_override) { // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int kW, kH, dW, dH, padW, padH; - std::tie(kW, kH) = get_kernel(kernel_size); - std::tie(dW, dH) = get_stride(stride, kW, kH); - std::tie(padW, padH) = get_padding(padding); + auto [kW, kH] = get_kernel(kernel_size); + auto [dW, dH] = get_stride(stride, kW, kH); + auto [padW, padH] = get_padding(padding); const int64_t nbatch = input.ndimension() == 4 ? input.size(-4) : 1; const int64_t nInputPlane = input.size(-3); @@ -268,12 +266,9 @@ Tensor qnnpack_avg_pool2d( bool ceil_mode, bool count_include_pad, c10::optional divisor_override) { - Tensor output; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int kW, kH, dW, dH, padW, padH; - std::tie(kW, kH) = get_kernel(kernel_size); - std::tie(dW, dH) = get_stride(stride, kW, kH); - std::tie(padW, padH) = get_padding(padding); + auto [kW, kH] = get_kernel(kernel_size); + auto [dW, dH] = get_stride(stride, kW, kH); + auto [padW, padH] = get_padding(padding); TORCH_CHECK( input.ndimension() == 4, "qnnpack_avg_pool2d(): Expected input to be 4-dimensional: got ", @@ -304,7 +299,7 @@ Tensor qnnpack_avg_pool2d( oH > 0 && oW > 0, "qnnpack_avg_pool2d(): the resulting output Tensor size should be >= 0"); // NHWC output - output = at::_empty_affine_quantized( + auto output = at::_empty_affine_quantized( output_shape, at::device(kCPU).dtype(kQUInt8), scale, diff --git a/aten/src/ATen/native/quantized/cpu/AveragePool3d.cpp b/aten/src/ATen/native/quantized/cpu/AveragePool3d.cpp index 93534b70c2c0f..875ae28e46a96 100644 --- a/aten/src/ATen/native/quantized/cpu/AveragePool3d.cpp +++ b/aten/src/ATen/native/quantized/cpu/AveragePool3d.cpp @@ -14,8 +14,6 @@ #include #endif -#include - #include namespace at { @@ -103,11 +101,9 @@ Tensor q_avg_pool3d( bool ceil_mode, bool count_include_pad, c10::optional divisor_override) { - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int kD, kW, kH, dD, dW, dH, padD, padW, padH; - std::tie(kW, kH, kD) = get_kernel(kernel_size); - std::tie(dW, dH, dD) = get_stride(stride, kW, kH, kD); - std::tie(padW, padH, padD) = get_padding(padding); + auto [kW, kH, kD] = get_kernel(kernel_size); + auto [dW, dH, dD] = get_stride(stride, kW, kH, kD); + auto [padW, padH, padD] = get_padding(padding); const int64_t nbatch = input.ndimension() == 5 ? 
input.size(-5) : 1; const int64_t nInputPlane = input.size(-4); diff --git a/aten/src/ATen/native/quantized/cpu/BinaryOps.cpp b/aten/src/ATen/native/quantized/cpu/BinaryOps.cpp index a25e2e23a32df..8b5fb286ec611 100644 --- a/aten/src/ATen/native/quantized/cpu/BinaryOps.cpp +++ b/aten/src/ATen/native/quantized/cpu/BinaryOps.cpp @@ -326,8 +326,8 @@ Tensor xnnp_add(Tensor qa, Tensor qb, double scale, int64_t zero_point) { auto output_min = std::numeric_limits::min(); if (ReLUFused) { /* - * FIXME: use acticationLimits() - * With , MSVC runs into "error C3862: indetifier activationLimits not found". + * FIXME: use activationLimits() + * With , MSVC runs into "error C3862: identifier activationLimits not found". */ constexpr int64_t qmin = std::numeric_limits::min(); constexpr int64_t qmax = std::numeric_limits::max(); @@ -405,7 +405,7 @@ Tensor qadd(Tensor qa, Tensor qb, double scale, int64_t zero_point) { #endif // USE_XNNPACK #ifdef USE_PYTORCH_QNNPACK - if(qa.sizes() == qb.sizes() && /* qnnpack does not support boradcasting */ + if(qa.sizes() == qb.sizes() && /* qnnpack does not support broadcasting */ qa.scalar_type() == kQUInt8) { return qnnpack_add(qa, qb, scale, zero_point); } diff --git a/aten/src/ATen/native/quantized/cpu/IntReprQuant.cpp b/aten/src/ATen/native/quantized/cpu/IntReprQuant.cpp index 9867a8f48a9ea..cfcce3465a731 100644 --- a/aten/src/ATen/native/quantized/cpu/IntReprQuant.cpp +++ b/aten/src/ATen/native/quantized/cpu/IntReprQuant.cpp @@ -32,7 +32,7 @@ Tensor int_repr_quantized_cpu(const Tensor& self) { {out_size}, self.options().dtype(UNDERLYING_TYPE), self.suggest_memory_format()); - const underlying_t* qdata = reinterpret_cast(self.data_ptr()); + const underlying_t* qdata = reinterpret_cast(self.const_data_ptr()); for (const auto i : c10::irange(dst.numel())) { dst[i] = static_cast(qdata[i]); } diff --git a/aten/src/ATen/native/quantized/cpu/Normalization.cpp b/aten/src/ATen/native/quantized/cpu/Normalization.cpp index 05a9585274306..0f5fb9884a9c5 100644 --- a/aten/src/ATen/native/quantized/cpu/Normalization.cpp +++ b/aten/src/ATen/native/quantized/cpu/Normalization.cpp @@ -80,8 +80,8 @@ Tensor q_batch_norm1d_impl( TORCH_CHECK(weight.numel() == C, "Expect weight size to match C"); TORCH_CHECK(bias.numel() == C, "Expect weight size to match C"); - const float* weight_data = weight.template data_ptr(); - const float* bias_data = bias.template data_ptr(); + const float* weight_data = weight.template const_data_ptr(); + const float* bias_data = bias.template const_data_ptr(); TORCH_CHECK(mean.numel() == C, "Mean size must match channel dimension"); TORCH_CHECK(var.numel() == C, "Variance size must match channel dimension"); @@ -91,8 +91,8 @@ Tensor q_batch_norm1d_impl( float* alpha_data = alpha.mutable_data_ptr(); float* beta_data = beta.data_ptr(); - const float* mean_data = mean.template data_ptr(); - const float* var_data = var.template data_ptr(); + const float* mean_data = mean.template const_data_ptr(); + const float* var_data = var.template const_data_ptr(); if (ndim == 2) { // create a fake H and W dimension so we can use NHWC @@ -189,8 +189,8 @@ Tensor q_batch_norm2d_impl( TORCH_CHECK(weight.numel() == C, "Expect weight size to match C"); TORCH_CHECK(bias.numel() == C, "Expect weight size to match C"); - const float* weight_data = weight.template data_ptr(); - const float* bias_data = bias.template data_ptr(); + const float* weight_data = weight.template const_data_ptr(); + const float* bias_data = bias.template const_data_ptr(); TORCH_CHECK(mean.numel() == C, 
"Mean size must match channel dimension"); TORCH_CHECK(var.numel() == C, "Variance size must match channel dimension"); @@ -200,8 +200,8 @@ Tensor q_batch_norm2d_impl( float* alpha_data = alpha.mutable_data_ptr(); float* beta_data = beta.data_ptr(); - const float* mean_data = mean.template data_ptr(); - const float* var_data = var.template data_ptr(); + const float* mean_data = mean.template const_data_ptr(); + const float* var_data = var.template const_data_ptr(); auto oSizes = qx.sizes(); auto qx_nhwc = qx.contiguous(MemoryFormat::ChannelsLast); @@ -285,8 +285,8 @@ Tensor q_batch_norm3d_impl( TORCH_CHECK(weight.numel() == C, "Expect weight size to match C"); TORCH_CHECK(bias.numel() == C, "Expect weight size to match C"); - const float* weight_data = weight.template data_ptr(); - const float* bias_data = bias.template data_ptr(); + const float* weight_data = weight.template const_data_ptr(); + const float* bias_data = bias.template const_data_ptr(); TORCH_CHECK(mean.numel() == C, "Mean size must match channel dimension"); TORCH_CHECK(var.numel() == C, "Variance size must match channel dimension"); @@ -296,8 +296,8 @@ Tensor q_batch_norm3d_impl( float* alpha_data = alpha.mutable_data_ptr(); float* beta_data = beta.data_ptr(); - const float* mean_data = mean.template data_ptr(); - const float* var_data = var.template data_ptr(); + const float* mean_data = mean.template const_data_ptr(); + const float* var_data = var.template const_data_ptr(); auto oSizes = qx.sizes(); auto qx_nhwc = qx.contiguous(MemoryFormat::ChannelsLast3d); diff --git a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h index 4537feddd0c7b..8887bb83deb91 100644 --- a/aten/src/ATen/native/quantized/cpu/OnednnUtils.h +++ b/aten/src/ATen/native/quantized/cpu/OnednnUtils.h @@ -115,13 +115,6 @@ enum PostOps { Gelu }; -static std::unordered_map POST_OP_TABLE = { - {"none", NoPostOp}, - {"relu", Relu}, - {"leaky_relu", LeakyRelu}, - {"tanh", Tanh}, - {"gelu", Gelu} -}; struct PackedLinearWeightsOnednn : public LinearPackedParamsBase { PackedLinearWeightsOnednn( @@ -317,19 +310,81 @@ struct PackedConvWeightsOnednn : public ConvPackedParamsBase { namespace onednn_utils { static ideep::attr_t create_attr_by_post_op( - const std::string& post_op_name, - const torch::List>& post_op_args, - const dnnl::algorithm post_algorithm) { + const c10::string_view& binary_post_op, + double binary_alpha, + double input1_scale, + int64_t input1_zero_point, + const ideep::tensor::desc& input1_desc, + const c10::string_view& unary_post_op, + const torch::List>& unary_post_op_args, + const c10::string_view& unary_post_op_algorithm) { using ideep::tensor; - PostOps post_op = POST_OP_TABLE[post_op_name]; - if (post_op == Relu) { - return ideep::attr_t::fuse_relu(); - } else if (post_op == LeakyRelu) { - return ideep::attr_t::fuse_relu_v2(/*alpha=*/post_op_args[0].value().to()); - } else if (post_op == Tanh) { - return ideep::attr_t::fuse_tanh(); - } else if (post_op == Gelu) { - return ideep::attr_t::fuse_gelu_v2(0.f, 0.f, post_algorithm); + if (binary_post_op == "none") { + if (unary_post_op == "relu") { + return ideep::attr_t::fuse_relu(); + } else if (unary_post_op == "leaky_relu") { + TORCH_CHECK( + unary_post_op_args.size() == 1, + "onednn qlinear: expect one argument for post op leaky_relu but got ", unary_post_op_args.size(), " args"); + auto alpha = unary_post_op_args[0].value().to(); + return ideep::attr_t::fuse_relu_v2(alpha); + } else if (unary_post_op == "tanh") { + return 
ideep::attr_t::fuse_tanh(); + } else if (unary_post_op == "gelu") { + TORCH_CHECK( + unary_post_op_algorithm == "none" || unary_post_op_algorithm == "tanh", + "onednn qlinear: algorithm for post op gelu must be none or tanh but got ", unary_post_op_algorithm); + auto post_algorithm = unary_post_op_algorithm == "none" ? + dnnl::algorithm::eltwise_gelu_erf : + dnnl::algorithm::eltwise_gelu_tanh; + return ideep::attr_t::fuse_gelu_v2(0.f, 0.f, post_algorithm); + } else if (unary_post_op == "hardtanh") { + TORCH_CHECK( + unary_post_op_args.size() == 2 && + unary_post_op_args[0].has_value() && + unary_post_op_args[1].has_value(), + "hardtanh is expected to have two scalar input: min_val and max_val"); + auto lower_bound_value = + unary_post_op_args[0].value().to(); + auto upper_bound_value = + unary_post_op_args[1].value().to(); + return ideep::attr_t::fuse_clamp(lower_bound_value, upper_bound_value); + } else if (unary_post_op == "hardswish") { + return ideep::attr_t::fuse_hardswish(); + } else if (unary_post_op == "swish") { + return ideep::attr_t::fuse_swish(); + } else { + TORCH_CHECK( + unary_post_op == "none", + "onednn qlinear: unsupported unary post op ", unary_post_op); + } + } else if (binary_post_op == "sum") { + if (unary_post_op == "none") { + return ideep::attr_t::fuse_sum(input1_scale, input1_zero_point); + } else if (unary_post_op == "relu") { + return ideep::attr_t::residual_with_sum_zero_point(input1_scale, input1_zero_point); + } else { + TORCH_CHECK( + false, + "onednn qlinear: unsupported unary post op ", unary_post_op, " with binary post op sum"); + } + } else if (binary_post_op == "add") { + if (unary_post_op == "none") { + return ideep::attr_t::fuse_binary(ideep::algorithm::binary_add, input1_desc); + } else if (unary_post_op == "relu") { + ideep::post_ops po; + po.append_binary(ideep::algorithm::binary_add, input1_desc); + po.append_eltwise(ideep::algorithm::eltwise_relu, 0, 0); + return ideep::attr_t::attr_post_ops(po); + } else { + TORCH_CHECK( + false, + "onednn qlinear: unsupported unary post op ", unary_post_op, " with binary post op add"); + } + } else { + TORCH_CHECK( + false, + "onednn qlinear: unsupported binary post op ", binary_post_op); } return ideep::attr_t(); } @@ -430,7 +485,7 @@ static at::Tensor _quantized_convolution_onednn( torch::List dilation, bool transposed, int64_t groups, - double inv_output_scale, + double output_scale, int64_t output_zero_point, c10::optional accum=c10::nullopt, // accum to fused with conv add double accum_scale=1.0, diff --git a/aten/src/ATen/native/quantized/cpu/RuyUtils.cpp b/aten/src/ATen/native/quantized/cpu/RuyUtils.cpp index c9aeb06930ddd..4a9791eb0faf3 100644 --- a/aten/src/ATen/native/quantized/cpu/RuyUtils.cpp +++ b/aten/src/ATen/native/quantized/cpu/RuyUtils.cpp @@ -32,6 +32,6 @@ void quantize_multiplier(double scale, } // namespace ruy_utils } // namespace native -} // namesplace +} // namespace #endif // USE_RUY_QMATMUL diff --git a/aten/src/ATen/native/quantized/cpu/RuyUtils.h b/aten/src/ATen/native/quantized/cpu/RuyUtils.h index aeb332af4ecae..72abe1ad817f4 100644 --- a/aten/src/ATen/native/quantized/cpu/RuyUtils.h +++ b/aten/src/ATen/native/quantized/cpu/RuyUtils.h @@ -16,6 +16,6 @@ void quantize_multiplier(double scale, } // namespace ruy_utils } // namespace native -} // namesplace +} // namespace #endif // USE_RUY_QMATMUL diff --git a/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp b/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp index 8a3d4b737f777..f428745eaa86f 100644 --- 
a/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp +++ b/aten/src/ATen/native/quantized/cpu/UpSampleBilinear2d.cpp @@ -22,7 +22,7 @@ namespace at { namespace native { namespace { -// pre calcuate interpolation params on width +// pre calculate interpolation params on width struct UpsampleBilinearParamW { int64_t w1, w1p; float w0lambda, w1lambda; @@ -48,14 +48,14 @@ static void upsample_bilinear2d_out_frame( bool align_corners, c10::optional scales_h, c10::optional scales_w) { - auto* idata = static_cast(input.data_ptr()); + auto* idata = static_cast(input.const_data_ptr()); auto* odata = static_cast(output.data_ptr()); channels = channels * nbatch; if (channels == 0 || output_height == 0 || output_width == 0) { return; } - auto* i_p = reinterpret_cast(idata); + auto* i_p = reinterpret_cast(idata); auto* o_p = reinterpret_cast(odata); // special case: just copy diff --git a/aten/src/ATen/native/quantized/cpu/conv_serialization.h b/aten/src/ATen/native/quantized/cpu/conv_serialization.h index 3c79d806d31c2..9f452a1cc7213 100644 --- a/aten/src/ATen/native/quantized/cpu/conv_serialization.h +++ b/aten/src/ATen/native/quantized/cpu/conv_serialization.h @@ -236,9 +236,7 @@ ConvParamsSerializationTypeV2 serialize_conv( // clone to retain ownership of the data .clone(); - at::Tensor weight; - c10::optional bias; - std::tie(weight, bias) = params->unpack(); + auto [weight, bias] = params->unpack(); non_optional.emplace_back(std::move(params_tensor)); non_optional.emplace_back(std::move(weight)); @@ -267,9 +265,7 @@ ConvParamsSerializationTypeV3 serialize_conv( config_vals.push_back(params->groups()); config_vals.push_back(params->transpose()); - at::Tensor weight; - c10::optional bias; - std::tie(weight, bias) = params->unpack(); + auto [weight, bias] = params->unpack(); std::vector> tensors; tensors.emplace_back(); @@ -287,12 +283,7 @@ ConvParamsSerializationTypeV3 serialize_conv( template c10::intrusive_ptr> deserialize_conv( ConvParamsSerializationTypeV3 state) { - - int64_t version; - std::vector config_vals; - std::vector> tensors; - - std::tie(version, config_vals, tensors) = state; + auto [version, config_vals, tensors] = state; TORCH_INTERNAL_ASSERT(version == 3, "Unexpected serialized qconv version: ", version); TORCH_CHECK(tensors.size() == 3, "Wrong number of tensors", tensors.size()); diff --git a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp index 2d15e54c4052b..d942e2f161a26 100644 --- a/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp +++ b/aten/src/ATen/native/quantized/cpu/fbgemm_utils.cpp @@ -410,16 +410,10 @@ register_conv_params() { return deserialize_conv(state); }) .def("weight", [](const c10::intrusive_ptr>& self) { - at::Tensor weight; - c10::optional bias; - std::tie(weight, bias) = self->unpack(); - return weight; + return std::get<0>(self->unpack()); }) .def("bias", [](const c10::intrusive_ptr>& self) { - at::Tensor weight; - c10::optional bias; - std::tie(weight, bias) = self->unpack(); - return bias; + return std::get<1>(self->unpack()); }) .def("unpack", &ConvPackedParamsBase::unpack) .def("stride", &ConvPackedParamsBase::stride) @@ -446,10 +440,7 @@ TORCH_API int register_linear_params() { .def_pickle( [](const c10::intrusive_ptr& params) -> SerializationType { // __getstate__ - at::Tensor weight; - c10::optional bias; - std::tie(weight, bias) = params->unpack(); - return std::make_tuple(std::move(weight), std::move(bias)); + return params->unpack(); }, [](SerializationType state) -> 
c10::intrusive_ptr< @@ -501,10 +492,7 @@ TORCH_API int register_linear_params() { TORCH_CHECK(false, "Unknown qengine"); }) .def("bias", [](const c10::intrusive_ptr& self) { - at::Tensor weight; - c10::optional bias; - std::tie(weight, bias) = self->unpack(); - return bias; + return std::get<1>(self->unpack()); }) .def("unpack", &LinearPackedParamsBase::unpack); // (1) we can't (easily) return the static initializer itself because it can have a different type because of selective build @@ -548,12 +536,7 @@ int register_embedding_params() { [](EmbeddingParamsSerializationType state) -> c10::intrusive_ptr { // __setstate__ call - std::vector tensors; - std::vector doubles; - std::vector longs; - // NOLINTNEXTLINE(cppcoreguidelines-init-variables) - int64_t version; - std::tie(version, tensors, doubles, longs) = std::move(state); + auto [version, tensors, doubles, longs] = std::move(state); TORCH_INTERNAL_ASSERT(tensors.size() == 1, "EmbeddingPackedParams: Expected weight tensor to be serialized"); TORCH_INTERNAL_ASSERT(longs.size() == 1, "EmbeddingPackedParams: Expected bit_rate to be serialized"); diff --git a/aten/src/ATen/native/quantized/cpu/fused_obs_fake_quant.cpp b/aten/src/ATen/native/quantized/cpu/fused_obs_fake_quant.cpp index 77c60141b0655..409f6e38d3e0b 100644 --- a/aten/src/ATen/native/quantized/cpu/fused_obs_fake_quant.cpp +++ b/aten/src/ATen/native/quantized/cpu/fused_obs_fake_quant.cpp @@ -41,8 +41,8 @@ void calculate_moving_average( } else { std::tie(x_min, x_max) = at::aminmax(x); } - const float* min_curr_val = x_min.data_ptr(); - const float* max_curr_val = x_max.data_ptr(); + const float* min_curr_val = x_min.const_data_ptr(); + const float* max_curr_val = x_max.const_data_ptr(); // Moving Average Min/Max observer for input tensor float* running_min_val = running_min.data_ptr(); float* running_max_val = running_max.data_ptr(); diff --git a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp index 45d3c9a864ced..dc9063ecf46f1 100644 --- a/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp +++ b/aten/src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp @@ -913,7 +913,7 @@ void qhardsigmoid_kernel(const Tensor& qx, Tensor& qy) { fVec kThreeVec(3.0f); fVec kSixVec(6.0f); - // Naive implemenentation: uses dequantize/execute/quantize routine + // Naive implementation: uses dequantize/execute/quantize routine cpu_kernel_vec( iter, [&](scalar_t qx) -> scalar_t { @@ -1070,7 +1070,7 @@ void qthreshold_kernel( Vec threshold_vec = Vec(threshold_float); Vec value_vec = Vec(value_float); - // Naive implemenentation: uses dequantize/execute/quantize routine + // Naive implementation: uses dequantize/execute/quantize routine cpu_kernel_vec( iter, [&](scalar_t value_qx) -> scalar_t { @@ -1152,7 +1152,7 @@ void qtanh_kernel(const Tensor& qx, Tensor& qy) { auto scale_neg_zp_premul_vec = scale_vec * zero_point_vec.neg(); AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qtanh", [&]() { - // Naive implemenentation: uses dequantize/execute/quantize routine + // Naive implementation: uses dequantize/execute/quantize routine // - Output scale is set to 2.0 / 2^(BIT_NUM) // - For signed types output zero point is set to 0 // - For unsigned types output zero point is set to (qmax + qmin) / 2.0 @@ -2734,7 +2734,7 @@ void fake_quantize_learnable_channel_grad_kernel_cpu( float grad_factor) { iter.for_each([&](char** data, const int64_t* strides, int64_t n) { /* To see how the input and outputs are 
referenced and assigned, - please see the implemenetation of + please see the implementation of fake_quantize_learnable_tensor_grad_kernel_cpu. */ for (const auto i : c10::irange(n)) { @@ -2797,8 +2797,8 @@ void quantized_normalize_kernel( "Unexpected size of beta"); scalar_t* X_data = X.data_ptr(); - const float* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; - const float* beta_data = beta.defined() ? beta.data_ptr() : nullptr; + const float* gamma_data = gamma.defined() ? gamma.const_data_ptr() : nullptr; + const float* beta_data = beta.defined() ? beta.const_data_ptr() : nullptr; scalar_t* Y_data = Y->data_ptr(); const bool gamma_null = gamma_data == nullptr; const bool beta_null = beta_data == nullptr; @@ -3085,8 +3085,8 @@ void quantized_groupnorm_nhwc_kernel( "Unexpected size of beta"); scalar_t* X_data = X.data_ptr(); - const float* gamma_data = gamma.defined() ? gamma.data_ptr() : nullptr; - const float* beta_data = beta.defined() ? beta.data_ptr() : nullptr; + const float* gamma_data = gamma.defined() ? gamma.const_data_ptr() : nullptr; + const float* beta_data = beta.defined() ? beta.const_data_ptr() : nullptr; scalar_t* Y_data = Y->data_ptr(); const bool gamma_null = gamma_data == nullptr; const bool beta_null = beta_data == nullptr; @@ -3265,7 +3265,7 @@ void quantized_groupnorm_nhwc_kernel( // // We could fuse step 3 and 4 into a single session but this way is better: // a. D might be too small for vectorization; - // b. Avoid duplicate caculation of scale/bias, each HxW plain share the same scale/bias + // b. Avoid duplicate calculation of scale/bias, each HxW plain share the same scale/bias // for (const auto n : c10::irange(Bs)) { for (const auto g : c10::irange(G)) { @@ -3336,7 +3336,7 @@ void quantize_tensor_per_tensor_affine_cpu( AT_DISPATCH_QINT_TYPES( qtensor.scalar_type(), "quantize_tensor_per_tensor_affine_cpu", [&]() { check_tensor_memory_format(rtensor, qtensor); - const float* rd = rtensor.data_ptr(); + const float* rd = rtensor.const_data_ptr(); auto qd = reinterpret_cast(qtensor.data_ptr()); // NOLINTNEXTLINE(cppcoreguidelines-pro-type-member-init) fbgemm::TensorQuantizationParams qparams; @@ -3668,7 +3668,7 @@ void quantize_tensor_per_tensor_affine_cpu( double scale, int64_t zero_point) { check_tensor_memory_format(rtensor, qtensor); - const float* rdata = rtensor.data_ptr(); + const float* rdata = rtensor.const_data_ptr(); int numel = rtensor.numel(); #if defined(__ARM_NEON__) || defined(__aarch64__) AT_DISPATCH_QINT_TYPES( @@ -3707,7 +3707,7 @@ void dequantize_tensor_per_tensor_affine_cpu( #if defined(__ARM_NEON__) || defined(__aarch64__) AT_DISPATCH_QINT_TYPES( qtensor.scalar_type(), "dequantize_tensor_per_tensor_affine_cpu", [&]() { - const scalar_t* qdata = qtensor.data_ptr(); + const scalar_t* qdata = qtensor.const_data_ptr(); auto dequantize_range = [&](int64_t begin, int64_t end) { dequantize_tensor_arm( qdata + begin, rdata + begin, end - begin, scale, zero_point); @@ -3722,7 +3722,7 @@ void dequantize_tensor_per_tensor_affine_cpu( // Fallback path AT_DISPATCH_QINT_TYPES( qtensor.scalar_type(), "dequantize_tensor_per_tensor_affine_cpu", [&]() { - const scalar_t* qdata = qtensor.data_ptr(); + const scalar_t* qdata = qtensor.const_data_ptr(); for (const auto i : c10::irange(numel)) { rdata[i] = dequantize_val(scale, zero_point, qdata[i]); } @@ -3743,7 +3743,7 @@ void quantize_tensor_per_channel_impl( // TODO: channels last kernel can be made faster. // For contiguous tensors, e.g. NCHW, arbitrary axis can be used. 
// For channels_last/3d however axis == 0 or 1. - // Since current implemntation on channels_last format does not + // Since current implementation on channels_last format does not // cover per channel quant with arbitrary axis value, it is better // to check and fail. int64_t batches = size_to_dim_(axis, rtensor.sizes()); @@ -3752,7 +3752,7 @@ void quantize_tensor_per_channel_impl( int64_t channels = rtensor.size(axis); auto scales_data = scales.data_ptr(); auto zero_points_data = zero_points.data_ptr(); - const float* in = rtensor.data_ptr(); + const float* in = rtensor.const_data_ptr(); auto out = qtensor.data_ptr(); if (axis == 1 && (rtensor.is_contiguous(MemoryFormat::ChannelsLast) || @@ -3804,7 +3804,7 @@ void quantize_tensor_per_channel_impl( int64_t channels = rtensor.size(axis); auto scales_data = scales.data_ptr(); auto zero_points_data = zero_points.data_ptr(); - const float* in = rtensor.data_ptr(); + const float* in = rtensor.const_data_ptr(); auto out = (uint8_t*)qtensor.data_ptr(); #if defined(__ARM_NEON__) // magic float and magic int to take care of rounding @@ -4008,7 +4008,7 @@ void dequantize_per_channel_affine_kernel( // For contiguous tensors, e.g. NCHW, arbitrary axis can be used. // For channels_last/3d however axis == 0 or 1. - // Since current implemntation on channels_last format does not + // Since current implementation on channels_last format does not // cover per channel quant with arbitrary axis value, it is better // to check and fail. TORCH_CHECK(rtensor.is_contiguous() || (axis <=1), @@ -4022,7 +4022,7 @@ void dequantize_per_channel_affine_kernel( auto scales_data = scales.data_ptr(); auto zero_points_data = zero_points.data_ptr(); check_tensor_memory_format(qtensor, rtensor); - const auto* qd = qtensor.data_ptr(); + const auto* qd = qtensor.const_data_ptr(); float* rd = rtensor.data_ptr(); const auto elem_per_byte = 8 / bit_width; if (axis == 1 && (rtensor.is_contiguous(MemoryFormat::ChannelsLast) || @@ -4084,7 +4084,7 @@ void quantize_tensor_per_channel_float_qparams_cpu( int64_t axis) { // For contiguous tensors, e.g. NCHW, arbitrary axis can be used. // For channels_last/3d however axis == 0 or 1. - // Since current implemntation on channels_last format does not + // Since current implementation on channels_last format does not // cover per channel quant with arbitrary axis value, it is better // to check and fail. 
TORCH_CHECK(rtensor.is_contiguous() || (axis <=1), @@ -4099,7 +4099,7 @@ void quantize_tensor_per_channel_float_qparams_cpu( auto scales_data = scales.data_ptr(); auto zero_points_data = zero_points.data_ptr(); check_tensor_memory_format(rtensor, qtensor); - const float* rdata = rtensor.data_ptr(); + const float* rdata = rtensor.const_data_ptr(); auto qdata = reinterpret_cast(qtensor.data_ptr()); const auto elem_per_byte = CHAR_BIT / bit_width; int qvalue = 0; @@ -4163,7 +4163,7 @@ void quantize_tensor_per_tensor_affine_sub_byte_cpu( AT_DISPATCH_QINT_AND_SUB_BYTE_TYPES( qtensor.scalar_type(), "quantize_tensor_per_tensor_affine_sub_byte_cpu", [&]() { check_tensor_memory_format(rtensor, qtensor); - const float* const rdata = rtensor.data_ptr(); + const float* const rdata = rtensor.const_data_ptr(); auto qdata = reinterpret_cast(qtensor.data_ptr()); auto numel = rtensor.numel(); const auto elem_per_byte = CHAR_BIT / bit_width; @@ -4196,7 +4196,7 @@ void dequantize_tensor_per_tensor_affine_sub_byte_cpu( qtensor.scalar_type(), "dequantize_tensor_per_tensor_affine_sub_byte_cpu", [&]() { check_tensor_memory_format(rtensor, qtensor); auto rdata = rtensor.data_ptr(); - const underlying_t* qdata = reinterpret_cast(qtensor.data_ptr()); + const underlying_t* qdata = reinterpret_cast(qtensor.const_data_ptr()); auto numel = rtensor.numel(); const auto elem_per_byte = CHAR_BIT / bit_width; diff --git a/aten/src/ATen/native/quantized/cpu/qconv.cpp b/aten/src/ATen/native/quantized/cpu/qconv.cpp index 9f3c790d52c75..f915c014af143 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv.cpp @@ -647,7 +647,7 @@ at::Tensor PackedConvWeightsQnnp::apply_impl_xnnp( // create an empty tensor for packing the weights const at::Tensor weight_contig = orig_weight.contiguous(c10::MemoryFormat::ChannelsLast); - const float* w_scales_data = w_scales.data_ptr(); + const float* w_scales_data = w_scales.const_data_ptr(); underlying_t w_zp = 0; at::Tensor weight_tensor; @@ -1397,7 +1397,7 @@ static at::Tensor _quantized_convolution_onednn( torch::List dilation, bool transposed, int64_t groups, - double inv_output_scale, // inv_output_scale is the reciprocal of scale in fake quant + double output_scale, int64_t output_zero_point, c10::optional accum, // accum to fused with conv add double accum_scale, @@ -1420,10 +1420,10 @@ static at::Tensor _quantized_convolution_onednn( bool bfloat16_output = output_dtype.has_value() && (output_dtype.value() == c10::kBFloat16); if (fp32_output || bfloat16_output) { // When fp32 or bf16 output, oneDNN expects op_attr doesn't set_scales and set_zero_points. - // So, we will use default inv_output_scale as 1.0 and output_zero_point as 0, since - // when inv_output_scale is 1.0, we will skip invoking of op_attr.set_scales in ideep; + // So, we will use default output_scale as 1.0 and output_zero_point as 0, since + // when output_scale is 1.0, we will skip invoking of op_attr.set_scales in ideep; // when output_zero_point is 0, we will skip invoking of op_attr.set_zero_points in ideep. 
- TORCH_CHECK(inv_output_scale == 1.0, " (ONEDNN): fp32 or bf16 output, inv_output_scale must be 1.0."); + TORCH_CHECK(output_scale == 1.0, " (ONEDNN): fp32 or bf16 output, output_scale must be 1.0."); TORCH_CHECK(output_zero_point == 0, " (ONEDNN): fp32 or bf16 output, output_zero_point must be 0"); } @@ -1504,9 +1504,6 @@ static at::Tensor _quantized_convolution_onednn( kSpatialDim, "D convolution."); // Parameters - // Scales of ONEDNN and PyTorch are reciprocal - const ideep::scale_t& src_scales = ideep::scale_t(1, 1.0 / act_scale); - #if IDEEP_PREREQ(3, 1, 0, 1) // 1. If the weight scale generated by observer should with dtype float32 // https://github.com/pytorch/pytorch/blob/d2c24eca8a60c56b31ca967a44d5cc4522802aa6/torch/ao/quantization/observer.py#L323 @@ -1592,75 +1589,117 @@ static at::Tensor _quantized_convolution_onednn( output_sizes = at::native::conv_output_size(input_size, kernel_size, padding.vec(), stride.vec(), dilation.vec()); ideep::dims dst_dims = ideep::dims({output_sizes.cbegin(), output_sizes.cend()}); // Output is not a quantized tensor but data type is uint8 - at::Tensor output; - if (fp32_output || bfloat16_output) { - output = at::empty( + at::Tensor output = has_accum_postop_sum ? + accum.value() : + at::empty( dst_dims, device(c10::kCPU) - .dtype(fp32_output ? c10::kFloat : c10::kBFloat16) - .memory_format(kSpatialDim == 2 ? - c10::MemoryFormat::ChannelsLast : - c10::MemoryFormat::ChannelsLast3d), - c10::nullopt); - } else { - output = at::empty( - dst_dims, - device(c10::kCPU) - .dtype(c10::kByte) + .dtype(fp32_output ? c10::kFloat : (bfloat16_output ? c10::kBFloat16 : c10::kByte)) .memory_format(kSpatialDim == 2 ? c10::MemoryFormat::ChannelsLast : c10::MemoryFormat::ChannelsLast3d) ); - } if (output.numel() == 0) { return output; } - ideep::tensor dst; - if (has_accum_postop_sum) { - auto dst_desc = ideep::tensor::desc(dst_dims, fp32_output ? ideep::tensor::data_type::f32 : ( - bfloat16_output ? ideep::tensor::data_type::bf16 : src_data_type), - kSpatialDim == 2 ? ideep::format_tag::nhwc : ideep::format_tag::ndhwc); - TORCH_CHECK(accum.value().dtype() == output.dtype(), "The output tensor should have same dtype as the accum tensor."); - // When fused with sum, the dst tensor will share the data ptr as the accum tensor. - dst.init(dst_desc, accum.value().data_ptr()); - } else { - if (fp32_output || bfloat16_output) { - // Conv without add: int8-in, fp32-output - dst = ideep::tensor({dst_dims, fp32_output ? ideep::tensor::data_type::f32 : ideep::tensor::data_type::bf16, {output.strides().cbegin(), output.strides().cend()}}, - output.data_ptr()); - } else { - dst = ideep::tensor({dst_dims, ideep::tensor::data_type::u8, {output.strides().cbegin(), output.strides().cend()}}, - output.data_ptr()); - } + ideep::tensor dst = at::native::itensor_view_from_dense(output); + static ideep::tensor::desc dummy_accum_desc; + ideep::attr_t op_attr = onednn_utils::create_attr_by_post_op( + binary_attr.has_value() ? binary_attr.value() : "none", + binary_alpha.has_value() ? binary_alpha.value().to() : 1.0, + accum_scale, + accum_zero_point, + dummy_accum_desc, + unary_attr.has_value() ? unary_attr.value() : "none", + unary_scalars, + unary_algorithm.has_value() ? unary_algorithm.value() : "" + ); + +#if IDEEP_PREREQ(3, 1, 0, 0) + // Use oneDNN's APIs instead of prepare/compute from ideep to reduce integration overhead. + // The functions from ideep are heavy because they have complex data structures for unified API + // oneDNN version >= 3.1.0 is required. 
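For reference on the block that follows: with raw oneDNN 3.x primitives, quantization scales and zero points are declared on the attr via set_scales_mask / set_zero_points_mask and then supplied at execution time as small f32 / s32 memories under DNNL_ARG_ATTR_SCALES / DNNL_ARG_ATTR_ZERO_POINTS, instead of being folded into the ideep wrappers. A minimal standalone sketch of that mechanism, deliberately using a reorder primitive rather than convolution to keep it short:

#include <cstdio>
#include <vector>
#include <dnnl.hpp>

int main() {
  using namespace dnnl;
  engine eng(engine::kind::cpu, 0);
  stream strm(eng);

  // s8 -> f32 reorder whose source scale is supplied at run time.
  memory::desc src_md({8}, memory::data_type::s8, memory::format_tag::x);
  memory::desc dst_md({8}, memory::data_type::f32, memory::format_tag::x);
  memory::desc scale_md({1}, memory::data_type::f32, memory::format_tag::x);

  std::vector<int8_t> src_data{-4, -3, -2, -1, 0, 1, 2, 3};
  std::vector<float> dst_data(8);
  std::vector<float> scale{0.5f};

  memory src_m(src_md, eng, src_data.data());
  memory dst_m(dst_md, eng, dst_data.data());
  memory scale_m(scale_md, eng, scale.data());

  primitive_attr attr;
  attr.set_scales_mask(DNNL_ARG_SRC, /*mask=*/0);  // one common scale for the whole tensor

  reorder::primitive_desc rpd(eng, src_md, eng, dst_md, attr);
  reorder(rpd).execute(strm, {{DNNL_ARG_SRC, src_m},
                              {DNNL_ARG_DST, dst_m},
                              {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, scale_m}});
  strm.wait();
  std::printf("dst[7] = %f\n", dst_data[7]);  // expect 3 * 0.5 = 1.5
  return 0;
}

The convolution path added below does the same thing with convolution_forward::primitive_desc, with the weight scales mask derived from the per-channel weight layout.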
+ using ideep::tensor; + auto weights_desc = packed_weight.get_desc(); + auto dst_desc = dst.get_desc(); + auto bias_desc = with_bias ? + tensor::desc(expected_bias.get_dims(), ideep::data_type::f32, ideep::format_tag::any) : + tensor::desc(); + if (act_scale != 1.0f) { + op_attr.set_scales_mask(DNNL_ARG_SRC, 0); } - ideep::attr_t op_attr; - // attr + if (act_zero_point != 0) { + op_attr.set_zero_points_mask(DNNL_ARG_SRC, 0); + } + int oc_per_group = packed_weight.get_dim(0) / groups; + int wei_scale_mask = ideep::utils::conv_weight_scale_mask(weight_scales.numel(), oc_per_group, groups, false); + op_attr.set_scales_mask(DNNL_ARG_WEIGHTS, wei_scale_mask); + if (output_scale != 1.0f) { + op_attr.set_scales_mask(DNNL_ARG_DST, 0); + } + if (output_zero_point != 0) { + op_attr.set_zero_points_mask(DNNL_ARG_DST, 0); + } + op_attr.set_scratchpad_mode(dnnl::scratchpad_mode::user); + auto engine = ideep::engine::cpu_engine(); + auto dilates_dnnl = ideep::utils::get_compatible_dilates(dilation.vec()); + auto primitive_desc = with_bias ? + dnnl::convolution_forward::primitive_desc( + engine, dnnl::prop_kind::forward_inference, dnnl::algorithm::convolution_direct, + src_desc, weights_desc, bias_desc, dst_desc, + stride.vec(), dilates_dnnl, padding.vec(), padding.vec(), op_attr + ) : + dnnl::convolution_forward::primitive_desc( + engine, dnnl::prop_kind::forward_inference, dnnl::algorithm::convolution_direct, + src_desc, weights_desc, dst_desc, + stride.vec(), dilates_dnnl, padding.vec(), padding.vec(), op_attr + ); + auto primitive = dnnl::convolution_forward(primitive_desc); + + // Reorder weight if needed + auto expected_weight = packed_weight.reorder_if_differ_in(primitive_desc.weights_desc()); + + // Prepare args and execute primitive + tensor scratchpad(primitive_desc.scratchpad_desc()); + ideep::exec_args args; + args.insert({DNNL_ARG_SRC, src}); + args.insert({DNNL_ARG_WEIGHTS, expected_weight}); + args.insert({DNNL_ARG_DST, dst}); + args.insert({DNNL_ARG_SCRATCHPAD, scratchpad}); + if (with_bias) { + args.insert({DNNL_ARG_BIAS, expected_bias}); + } + tensor src_scales_t = tensor(ideep::scale_t(1, act_scale)); + tensor wei_scales_t = tensor(weights_scales); + tensor dst_scales_t = tensor(ideep::scale_t(1, output_scale)); + tensor src_zp_t = tensor(ideep::zero_point_t(1, act_zero_point)); + tensor dst_zp_t = tensor(ideep::zero_point_t(1, output_zero_point)); + if (act_scale != 1.0f) { + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scales_t}); + } + if (output_scale != 1.0f) { + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scales_t}); + } + args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, wei_scales_t}); + if (act_zero_point != 0) { + args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, src_zp_t}); + } + if (output_zero_point != 0) { + args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST, dst_zp_t}); + } + primitive.execute(ideep::stream::default_stream(), args); +#else + // Scales of ONEDNN and PyTorch are reciprocal + const ideep::scale_t& src_scales = ideep::scale_t(1, 1.0 / act_scale); + + // set accum scale/zero point to dst if (has_accum_postop_sum) { - op_attr = (has_unary_post_op && unary_attr.value()=="relu") ? ideep::attr_t::residual_with_sum_zero_point() : ideep::attr_t::fuse_sum(); const ideep::scale_t accum_ideep_scale = ideep::scale_t(1, 1.0/accum_scale); const ideep::zero_point_t accum_ideep_zero_points = ideep::zero_point_t(1, accum_zero_point); // Set the dst scale and zero point with the value of accum. 
- // The true scale and zero point is stored in ideep::scale_t(scale_size, inv_output_scale) and dst_zero_points. + // The true scale and zero point are stored in ideep::scale_t(scale_size, output_scale) and dst_zero_points. dst.set_scale(accum_ideep_scale); dst.set_zero_point(accum_ideep_zero_points); - } else { - if (has_unary_post_op && unary_attr.value()=="relu") { - op_attr = ideep::attr_t::fuse_relu(); - } else if (has_unary_post_op && unary_attr.value()=="hardtanh") { - TORCH_CHECK( - unary_scalars.size() == 2 && - unary_scalars[0].get().toOptional().has_value() && - unary_scalars[1].get().toOptional().has_value(), - "hardtanh is expected to have two scalar input: min_val and max_val"); - - auto lower_bound_value = - unary_scalars[0].get().toOptional().value().to(); - auto upper_bound_value = - unary_scalars[1].get().toOptional().value().to(); - op_attr = ideep::attr_t::fuse_clamp(lower_bound_value, upper_bound_value); - } else { - op_attr = ideep::attr_t(); - } } // Weight Reorder @@ -1668,7 +1707,7 @@ static at::Tensor _quantized_convolution_onednn( ideep::convolution_forward::prepare( params, src, packed_weight, expected_bias, dst_dims, dst, stride.vec(), dilation.vec(), padding.vec(), padding.vec(), groups, - src_scales, weights_scales, ideep::scale_t(1, inv_output_scale), + src_scales, weights_scales, ideep::scale_t(1, 1.0f / output_scale), src_zero_points, dst_zero_points, op_attr, dnnl::algorithm::convolution_direct, dnnl::prop_kind::forward_inference, @@ -1678,6 +1717,7 @@ static at::Tensor _quantized_convolution_onednn( // Computation ideep::convolution_forward::compute(params, src, expected_weight, expected_bias, dst); +#endif if (is_1d) { output.squeeze_(quant_utils::kConv1dSqueezeDim + 2); @@ -1832,7 +1872,7 @@ class QConvoneDNN final { torch::List padding, torch::List dilation, int64_t groups, - double inv_output_scale, // inv_output_scale is the reciprocal of scale in fake quant + double output_scale, int64_t output_zero_point, c10::optional output_dtype, c10::string_view attr, @@ -1851,8 +1891,8 @@ class QConvoneDNN final { } else { // Conv2D post op check TORCH_CHECK( - attr == "none" || attr == "relu" || attr == "hardtanh", - "none post_op or post_op relu/hardtanh is supported for quantized pointwise conv2d. Got unary_post_op: ", + attr == "none" || attr == "relu" || attr == "hardtanh" || attr == "hardswish" || attr == "swish", + "none post_op or post_op relu/hardtanh/hardswish/swish is supported for quantized pointwise conv2d.
Got unary_post_op: ", attr, ".") } @@ -1860,7 +1900,7 @@ class QConvoneDNN final { act, act_scale, act_zero_point, weight, weight_scales, weight_zero_points, bias, stride, padding, dilation, /*transposed*/false, - groups, inv_output_scale, output_zero_point, + groups, output_scale, output_zero_point, /*accum*/c10::nullopt, /*accum_scale*/0.0, /*accum_zero_point*/0, /*output_dtype*/output_dtype, /*binary_attr*/c10::nullopt, /*binary_alpha*/c10::nullopt, /*unary_attr*/attr, /*unary_scalars*/scalars, /*unary_algorithm*/algorithm @@ -1884,7 +1924,7 @@ class QConvoneDNN final { torch::List padding, torch::List dilation, int64_t groups, - double inv_output_scale, // inv_output_scale is the reciprocal of scale in fake quant + double output_scale, int64_t output_zero_point, c10::optional output_dtype, c10::string_view binary_attr, @@ -1912,7 +1952,7 @@ class QConvoneDNN final { act, act_scale, act_zero_point, weight, weight_scales, weight_zero_points, bias, stride, padding, dilation, /*transposed*/false, - groups, inv_output_scale, output_zero_point, + groups, output_scale, output_zero_point, accum, accum_scale, accum_zero_point, /*output_dtype*/output_dtype, binary_attr, alpha, unary_attr, unary_scalars, unary_algorithm diff --git a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp index 6f996691c0946..46172f0c199f4 100644 --- a/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qconv_prepack.cpp @@ -283,9 +283,7 @@ c10::intrusive_ptr> PackedConvWeightsQnnp< auto kernel_dim = kSpatialDim == 2 ? std::vector{kernel_h, kernel_w} : std::vector{kernel_d, kernel_h, kernel_w}; - std::vector w_zero_points; - at::Tensor w_scales; - std::tie(w_zero_points, w_scales) = + auto [w_zero_points, w_scales] = make_zero_points_and_scales_tensor(weight_contig, transpose, groups); // We set the pre-packed conv weights to nullptr below as we call pre-pack // during the first invocation of operator run. Refer to qconv.cpp for more diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp index 0057fea54c2e0..7e5083057a0ba 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag.cpp @@ -491,7 +491,7 @@ at::Tensor& embedding_bag_byte_impl( /*offsets_or_lengths=*/offsets_data + start_idx, /*weights=*/ per_sample_weights_ - ? per_sample_weights_.value().data_ptr() + + ? 
per_sample_weights_.value().const_data_ptr() + offsets_data[start_idx] : nullptr, /*out=*/output_data + start_idx * D); diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp index 763ac7c784c83..9cfbce72e31d1 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_prepack.cpp @@ -436,8 +436,7 @@ Tensor _qembeddingbag_nbit_prepack_helper( // NOLINTNEXTLINE(cppcoreguidelines-init-variables) float Xmin, Xmax; if (optimized_qparams) { - at::Tensor xmax_tensor, xmin_tensor; - std::tie(xmax_tensor, xmin_tensor) = at::choose_qparams_optimized( + auto [xmax_tensor, xmin_tensor] = at::choose_qparams_optimized( float_weight[row], embedding_cols, nbins, ratio, bit_width); TORCH_CHECK( xmax_tensor.numel() == 1 && xmin_tensor.numel() == 1, diff --git a/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp b/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp index 3612f8eba2f88..7c1093a1c4c1a 100644 --- a/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qembeddingbag_unpack.cpp @@ -37,7 +37,7 @@ at::Tensor PackedEmbeddingBagWeight::unpack() { scale_bias_bytes = 4; } - const auto* input = packed_weight.data_ptr(); + const auto* input = packed_weight.const_data_ptr(); // Calculate the output shape, accounting for the last n bytes to be used // for scale/bias rest of the entries are packed depending on the bit_width. std::vector output_shape = { @@ -125,7 +125,7 @@ Tensor& qembeddingbag_byte_unpack_out(Tensor& output, const Tensor& packed_weigh // The last 2 values are used to store the FP32 scale and zero_point values // per row. const int32_t output_columns = input_columns - 2 * sizeof(float); - const auto* input_data = packed_weight.data_ptr(); + const auto* input_data = packed_weight.const_data_ptr(); std::vector output_shape = packed_weight_sizes.vec(); output_shape[col_dim] = output_columns; @@ -187,7 +187,7 @@ Tensor _qembeddingbag_nbit_unpack_helper( int BIT_RATE) { const auto input_rows = packed_weight.size(0); const auto input_columns = packed_weight.size(1); - const auto* input_data = packed_weight.data_ptr(); + const auto* input_data = packed_weight.const_data_ptr(); int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; // The last 4 bytes per row are two fp16 scale and zero_point. diff --git a/aten/src/ATen/native/quantized/cpu/qgelu.cpp b/aten/src/ATen/native/quantized/cpu/qgelu.cpp index f9a3c32343df7..743832431e0c4 100644 --- a/aten/src/ATen/native/quantized/cpu/qgelu.cpp +++ b/aten/src/ATen/native/quantized/cpu/qgelu.cpp @@ -18,4 +18,12 @@ Tensor gelu_quantized_cpu(const Tensor& qx, c10::string_view approximate) { qgelu_stub(qx.device().type(), qx, qy, get_gelutype_enum(approximate)); return qy; } + +Tensor& gelu_quantized_cpu_(Tensor& self, c10::string_view approximate) { + Tensor qy = gelu_quantized_cpu(self, approximate); + // This can be optimized in a future PR if it becomes a bottleneck. 
+ self.copy_(qy); + return self; +} + }} // namespace at::native diff --git a/aten/src/ATen/native/quantized/cpu/qlinear.cpp b/aten/src/ATen/native/quantized/cpu/qlinear.cpp index 78e4551119ddb..df6df3c35201d 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear.cpp @@ -123,6 +123,8 @@ at::Tensor& PackedLinearWeight::apply_impl( // Allocate a buffer for fbgemmPacked to use auto buffer = at::empty(out_sizes, output.options().dtype(at::kInt)); + auto output_data = reinterpret_cast(output.data_ptr()); + int num_tasks = at::get_num_threads(); at::parallel_for(0, num_tasks, 1, [&](int64_t begin, int64_t end) { for (const auto task_id : c10::irange(begin, end)) { @@ -184,7 +186,7 @@ at::Tensor& PackedLinearWeight::apply_impl( fbgemm::fbgemmPacked( /*packA=*/packA, /*packB=*/*packB, - /*C=*/reinterpret_cast(output.data_ptr()), + /*C=*/output_data, /*C_buffer=*/buffer.data_ptr(), /*ldc=*/N, /*outProcess=*/outputProcObj, @@ -220,7 +222,7 @@ at::Tensor& PackedLinearWeight::apply_impl( fbgemm::fbgemmPacked( /*packA=*/packA, /*packB=*/*packB, - /*C=*/reinterpret_cast(output.data_ptr()), + /*C=*/output_data, /*C_buffer=*/buffer.data_ptr(), /*ldc=*/N, /*outProcess=*/outputProcObj, @@ -314,7 +316,7 @@ at::Tensor PackedLinearWeight::apply_with_input_q_dq_qweight_dq_output_fp32_impl fbgemm::fbgemmSupportedCPU(), "Your CPU does not support FBGEMM."); auto input_contig = input.expect_contiguous(); - const auto* input_ptr = input_contig->data_ptr(); + const auto* input_ptr = input_contig->const_data_ptr(); TORCH_CHECK( input.dim() >= 2, @@ -358,6 +360,8 @@ at::Tensor PackedLinearWeight::apply_with_input_q_dq_qweight_dq_output_fp32_impl output.options().dtype(at::kInt), LEGACY_CONTIGUOUS_MEMORY_FORMAT); + auto output_data = output.data_ptr(); + int num_tasks = at::get_num_threads(); at::parallel_for(0, num_tasks, 1, [&](int64_t begin, int64_t end) { fbgemm::PackAWithQuantRowOffset packA( @@ -396,7 +400,7 @@ at::Tensor PackedLinearWeight::apply_with_input_q_dq_qweight_dq_output_fp32_impl fbgemm::fbgemmPacked( /*packA=*/packA, /*packB=*/*packB, - /*C=*/output.data_ptr(), + /*C=*/output_data, /*C_buffer=*/buffer.data_ptr(), /*ldc=*/N, /*outProcess=*/outputProcObj, @@ -428,7 +432,7 @@ at::Tensor PackedLinearWeight::apply_with_input_q_dq_qweight_dq_output_fp32_impl fbgemm::fbgemmPacked( /*packA=*/packA, /*packB=*/*packB, - /*C=*/output.data_ptr(), + /*C=*/output_data, /*C_buffer=*/buffer.data_ptr(), /*ldc=*/N, /*outProcess=*/outputProcObj, @@ -481,7 +485,7 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl_xnnp( xnn_operator_t xnnp_op = nullptr; - const float* weight_scales_data = w_scales.data_ptr(); + const float* weight_scales_data = w_scales.const_data_ptr(); // prepare weights underlying_t w_zp = static_cast( @@ -586,7 +590,7 @@ at::Tensor PackedLinearWeightsQnnp::apply_impl_xnnp( status, ")"); - // Run the opeator + // Run the operator status = xnn_run_operator( xnnp_linear_op.get(), // Linear op caffe2::pthreadpool_() // threadpool @@ -917,24 +921,61 @@ static at::Tensor linear_int8_with_onednn_weight( double output_scale, int64_t output_zero_point, c10::optional output_dtype, - std::string& post_op_name, // e.g. "none", "relu" - torch::List>& post_op_args, - std::string& post_op_algorithm) { + c10::optional other, // extra input for binary post-op + double other_scale, + int64_t other_zero_point, + const c10::string_view& binary_post_op, // e.g. "none", "sum", "add" + double binary_alpha, + const c10::string_view& unary_post_op, // e.g. 
"none", "relu" + torch::List>& unary_post_op_args, + c10::string_view& unary_post_op_algorithm) { using ideep::tensor; const int64_t dim = input.dim(); - output_scale = 1.0f / output_scale; TORCH_CHECK(input.scalar_type() == c10::ScalarType::Byte, "qlinear with mkldnn tensor: data type of input should be uint8 (unsigned char)."); TORCH_CHECK(onednn_weight.scalar_type() == c10::ScalarType::Char, "qlinear with mkldnn tensor: data type of weight should be int8 (char)."); TORCH_CHECK( weight_scales.scalar_type() == c10::ScalarType::Float, "weight scales should be dtype c10::ScalarType::Float."); + TORCH_CHECK( + binary_alpha == 1.0f, "onednn qlinear: alpha != 1 for binary post op is not yet supported."); bool fp32_output = output_dtype.has_value() && (output_dtype.value() == c10::kFloat); - bool bfloat16_output = output_dtype.has_value() && (output_dtype.value() == c10::kBFloat16); - if (fp32_output || bfloat16_output) { + bool bf16_output = output_dtype.has_value() && (output_dtype.value() == c10::kBFloat16); + if (fp32_output || bf16_output) { TORCH_CHECK( output_scale == 1.0f && output_zero_point == 0, "onednn qlinear: expect scale=1 and zero point=0 for fp32 output"); } + if (binary_post_op != "none") { + /* Supported cases for binary post op: + +-------------------+--------------+---------------+ + | Extra input dtype | Output dtype | Post op | + +-------------------+--------------+---------------+ + | Fp32/bf16 | fp32/bf16 | sum | + +-------------------+--------------+---------------+ + | Fp32/bf16 | int8 | add | + +-------------------+--------------+---------------+ + | int8 | fp32/bf16 | not supported | + +-------------------+--------------+---------------+ + | int8 | int8 | sum | + +-------------------+--------------+---------------+ + */ + TORCH_CHECK(other.has_value(), "onednn qlinear: the extra input is missing for post op ", binary_post_op); + if (fp32_output || bf16_output) { + TORCH_CHECK( + other_scale == 1.0f && other_zero_point == 0, + "onednn qlinear: expect extra input scale = 1.0 and zero point = 0 when output dtype is ", output_dtype.value(), + ", but got ", other_scale, " and ", other_zero_point, ", respectively" + ); + } + if (binary_post_op == "sum") { + auto expected_dtype = output_dtype.has_value() ? output_dtype.value() : c10::kByte; + TORCH_CHECK( + other.value().scalar_type() == expected_dtype, + "onednn qlinear: the dtype of extra input for binary post op should be ", expected_dtype, + " (same as output dtype), but got ", other.value().scalar_type() + ); + } + } // If the input has more than two dimensions, we will reshape it to a 2-dimensional form // for calculation and subsequently reshape the output back. @@ -962,35 +1003,45 @@ static at::Tensor linear_int8_with_onednn_weight( } std::vector src_dims = {M, K}; std::vector dst_dims = {M, N}; - at::Tensor output = at::empty( - dst_dims, - device(c10::kCPU) - .dtype(fp32_output ? c10::kFloat : (bfloat16_output ? c10::kBFloat16 : c10::kByte)) - ); + at::Tensor output = binary_post_op == "sum" ? + other.value() : + at::empty( + dst_dims, + device(c10::kCPU) + .dtype(fp32_output ? c10::kFloat : (bf16_output ? c10::kBFloat16 : c10::kByte)) + ); if (output.numel() == 0) { return output; } tensor dst = at::native::itensor_view_from_dense(output); + static tensor empty_tensor; + static tensor::desc empty_tensor_desc; + tensor src1 = binary_post_op == "add" ? 
+ at::native::itensor_view_from_dense(other.value().reshape({-1, other.value().size(dim - 1)})) : + empty_tensor; // Create onednn primitive auto src_desc = tensor::desc(src_dims, ideep::data_type::u8, ideep::format_tag::any); auto weights_desc = packed_weight.get_desc(); - auto dst_dtype = fp32_output ? ideep::data_type::f32 : (bfloat16_output ? ideep::tensor::data_type::bf16 : ideep::data_type::u8); + auto dst_dtype = dst.get_data_type(); auto dst_desc = tensor::desc(dst_dims, dst_dtype, ideep::format_tag::any); auto bias_desc = with_bias ? tensor::desc(onednn_bias.value().get_dims(), ideep::data_type::f32, ideep::format_tag::any) : - tensor::desc(); - dnnl::algorithm post_op_algo = dnnl::algorithm::undef; - if (post_op_name == "gelu") { - if (post_op_algorithm == "none") { - post_op_algo = dnnl::algorithm::eltwise_gelu_erf; - } else if (post_op_algorithm == "tanh") { - post_op_algo = dnnl::algorithm::eltwise_gelu_tanh; - } else { - TORCH_CHECK(false, "un-supported GELU approximate, none or tanh is supported."); - } - } - auto op_attr = onednn_utils::create_attr_by_post_op(post_op_name, post_op_args, post_op_algo); + empty_tensor_desc; + // Get op attr for primitive + // Note: output_scale & output_zero_point are for re-quantization of the final output. + // And other_scale & other_zero_point are for dequantization of other. + auto other_desc = binary_post_op == "add" ? src1.get_desc() : empty_tensor_desc; + auto op_attr = onednn_utils::create_attr_by_post_op( + binary_post_op, + binary_alpha, + other_scale, + other_zero_point, + other_desc, + unary_post_op, + unary_post_op_args, + unary_post_op_algorithm + ); if (input_scale != 1.0f) { op_attr.set_scales_mask(DNNL_ARG_SRC, 0); } @@ -1042,6 +1093,9 @@ static at::Tensor linear_int8_with_onednn_weight( if (output_zero_point != 0) { args.insert({DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST, dst_zp_t}); } + if (binary_post_op == "add") { + args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1, src1}); + } primitive.execute(ideep::stream::default_stream(), args); return dim == 2 ? 
output : output.reshape(output_size); } @@ -1144,21 +1198,121 @@ class QLinearOnednn final { double output_scale, int64_t output_zero_point, c10::optional output_dtype, - std::string post_op_name, + c10::string_view post_op_name, torch::List> post_op_args, - std::string post_op_algorithm) { + c10::string_view post_op_algorithm) { #if AT_MKLDNN_ENABLED() + static c10::optional other = c10::nullopt; + static const c10::string_view binary_post_op = "none"; return linear_int8_with_onednn_weight( act, act_scale, act_zero_point, onednn_weight, weight_scales, weight_zero_points, bias, output_scale, output_zero_point, output_dtype, + other, /*other scale*/1.0, /*other zp*/0, + binary_post_op, /*binary alpha*/1.0, post_op_name, post_op_args, post_op_algorithm ); #endif TORCH_CHECK(false, "Unimplemented (int8 linear with packed weight and bias)"); } -}; + static Tensor run_pointwise_tensor( + Tensor act, // int8 CPU tensor, not QTensor + Tensor act_scale, + Tensor act_zero_point, + Tensor onednn_weight, // int8 tensor from MkldnnCPU + Tensor weight_scales, + Tensor weight_zero_points, + c10::optional bias, + double output_scale, + int64_t output_zero_point, + c10::optional output_dtype, + c10::string_view post_op_name, + torch::List> post_op_args, + c10::string_view post_op_algorithm) { +#if AT_MKLDNN_ENABLED() + TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() == 1, + "onednn int8 linear: act scale/zp size should be 1"); + static c10::optional other = c10::nullopt; + static const c10::string_view binary_post_op = "none"; + return linear_int8_with_onednn_weight( + act, act_scale.item().toDouble(), act_zero_point.item().toLong(), + onednn_weight, weight_scales, weight_zero_points, + bias, output_scale, output_zero_point, output_dtype, + other, /*other scale*/1.0, /*other zp*/0, + binary_post_op, /*binary alpha*/1.0, + post_op_name, post_op_args, post_op_algorithm + ); +#endif + TORCH_CHECK(false, "Unimplemented (int8 linear with packed weight and bias)"); + } + + static Tensor run_pointwise_binary( + Tensor act, // int8 CPU tensor, not QTensor + double act_scale, + int64_t act_zero_point, + Tensor onednn_weight, // int8 tensor from MkldnnCPU + Tensor weight_scales, + Tensor weight_zero_points, + c10::optional bias, + double output_scale, + int64_t output_zero_point, + c10::optional output_dtype, + c10::optional other, // extra input for binary post-op + double other_scale, + int64_t other_zero_point, + c10::string_view binary_post_op, // e.g. "none", "sum", "add" + double binary_alpha, + c10::string_view unary_post_op, // e.g. 
"none", "relu" + torch::List> unary_post_op_args, + c10::string_view unary_post_op_algorithm) { +#if AT_MKLDNN_ENABLED() + return linear_int8_with_onednn_weight( + act, act_scale, act_zero_point, + onednn_weight, weight_scales, weight_zero_points, + bias, output_scale, output_zero_point, output_dtype, + other, other_scale, other_zero_point, + binary_post_op, binary_alpha, + unary_post_op, unary_post_op_args, unary_post_op_algorithm + ); +#endif + TORCH_CHECK(false, "Unimplemented (int8 linear with packed weight and bias)"); + } + + static Tensor run_pointwise_binary_tensor( + Tensor act, // int8 CPU tensor, not QTensor + Tensor act_scale, + Tensor act_zero_point, + Tensor onednn_weight, // int8 tensor from MkldnnCPU + Tensor weight_scales, + Tensor weight_zero_points, + c10::optional bias, + double output_scale, + int64_t output_zero_point, + c10::optional output_dtype, + c10::optional other, // extra input for binary post-op + double other_scale, + int64_t other_zero_point, + c10::string_view binary_post_op, // e.g. "none", "sum", "add" + double binary_alpha, + c10::string_view unary_post_op, // e.g. "none", "relu" + torch::List> unary_post_op_args, + c10::string_view unary_post_op_algorithm) { +#if AT_MKLDNN_ENABLED() + TORCH_CHECK(act_scale.numel() == 1 && act_zero_point.numel() == 1, + "onednn int8 linear: act scale/zp size should be 1"); + return linear_int8_with_onednn_weight( + act, act_scale.item().toDouble(), act_zero_point.item().toLong(), + onednn_weight, weight_scales, weight_zero_points, + bias, output_scale, output_zero_point, output_dtype, + other, other_scale, other_zero_point, + binary_post_op, binary_alpha, + unary_post_op, unary_post_op_args, unary_post_op_algorithm + ); +#endif + TORCH_CHECK(false, "Unimplemented (int8 linear with packed weight and bias)"); + } +}; TORCH_LIBRARY_IMPL(quantized, QuantizedCPU, m) { register_linear_params(); @@ -1181,6 +1335,12 @@ TORCH_LIBRARY_IMPL(quantized, CPU, m) { TORCH_LIBRARY_IMPL(onednn, MkldnnCPU, m) { m.impl(TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise"), TORCH_FN(QLinearOnednn::run_pointwise)); + m.impl(TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.tensor"), + TORCH_FN(QLinearOnednn::run_pointwise_tensor)); + m.impl(TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary"), + TORCH_FN(QLinearOnednn::run_pointwise_binary)); + m.impl(TORCH_SELECTIVE_NAME("onednn::qlinear_pointwise.binary_tensor"), + TORCH_FN(QLinearOnednn::run_pointwise_binary_tensor)); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp index f871877073a75..935ad081bd908 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_dynamic.cpp @@ -16,6 +16,9 @@ #include #include #include +#include +#include +#include #include #endif @@ -43,7 +46,7 @@ at::Tensor PackedLinearWeight::apply_dynamic_impl( // TODO: contiguous is called for further jit optimizations. 
auto input_contig = input.contiguous(); - const auto* input_ptr = input_contig.data_ptr(); + const auto* input_ptr = input_contig.const_data_ptr(); TORCH_CHECK( input.dim() >= 2, @@ -266,7 +269,7 @@ at::Tensor PackedLinearWeightsQnnp::apply_dynamic_impl( TORCH_CHECK(bias_vec.dim() == 1, "bias should be a vector (1D Tensor)"); auto bias_contig = bias_vec.contiguous(); - const float* bias_ptr = bias_contig.data_ptr(); + const float* bias_ptr = bias_contig.const_data_ptr(); // Calculate statistics for quantization of input Tensor // TODO: optimized kernel @@ -407,7 +410,7 @@ at::Tensor& PackedLinearWeightFp16::apply_dynamic_impl( const at::Tensor& input, at::Tensor& output) { const at::Tensor input_contig = input.contiguous(); - const float* input_ptr = input_contig.data_ptr(); + const float* input_ptr = input_contig.const_data_ptr(); auto& packed_weight_fp16 = *w; @@ -423,6 +426,8 @@ at::Tensor& PackedLinearWeightFp16::apply_dynamic_impl( // Resize output Tensor output.resize_(output_sizes); + auto output_data = output.data_ptr(); + int num_tasks = at::get_num_threads(); at::parallel_for(0, num_tasks, 1, [&](int64_t begin, int64_t end) { for (const auto task_id : c10::irange(begin, end)) { @@ -433,7 +438,7 @@ at::Tensor& PackedLinearWeightFp16::apply_dynamic_impl( /*A=*/input_ptr, /*Bp=*/packed_weight_fp16, /*beta=*/0.0f, - /*C=*/output.data_ptr(), + /*C=*/output_data, /*thread_id=*/static_cast(task_id), /*num_threads=*/num_tasks); } @@ -520,8 +525,7 @@ at::Tensor PackedLinearWeightsOnednn::apply_dynamic_impl( /*len=*/input.numel()); #else if (input_contig.numel() > 0) { - Tensor t_min, t_max; - std::tie(t_min, t_max) = at::aminmax(input_contig); + auto [t_min, t_max] = at::aminmax(input_contig); x_max = t_max.item(); x_min = t_min.item(); } @@ -659,6 +663,122 @@ class QLinearDynamicFp16 final { #endif // USE_FBGEMM }; +class QLinearUnpackedDynamicFp16 final { + public: +#ifdef USE_FBGEMM + static at::Tensor run( + at::Tensor input, + const at::Tensor& weight, + const at::Tensor& bias) { + // We make a strong guarantee that models using these operators will have + // the same numerics across different machines. Therefore, we do not provide + // a fallback path and rather fail loudly if we cannot run FBGEMM. + TORCH_CHECK( + fbgemm::fbgemmSupportedCPU(), "Your CPU doesn't support FBGEMM."); + + TORCH_CHECK( + weight.dim() == 2, + "The dimension of weight tensor should be equal to 2"); + + auto packed_weight = PackedLinearWeightFp16::prepack(weight, bias); + auto output = packed_weight->apply_dynamic(std::move(input)); + + return output; + } + + static at::Tensor meta( + at::Tensor input, + const at::Tensor& weight, + const at::Tensor& bias) { + // We make a strong guarantee that models using these operators will have + // the same numerics across different machines. Therefore, we do not provide + // a fallback path and rather fail loudly if we cannot run FBGEMM. + TORCH_CHECK( + fbgemm::fbgemmSupportedCPU(), "Your CPU doesn't support FBGEMM."); + + TORCH_CHECK( + weight.dim() == 2, + "The dimension of weight tensor should be equal to 2"); + + auto out_channel = weight.sym_sizes().vec()[0]; + auto out_sizes = input.sym_sizes().vec(); + out_sizes[out_sizes.size() - 1] = out_channel; + + return at::empty_symint(out_sizes, input.options()); + } +#else // USE_FBGEMM + static at::Tensor run( + at::Tensor /* input */, + const at::Tensor& weight, + const at::Tensor& bias) { + // We make a strong guarantee that models using these operators will have + // the same numerics across different machines. 
Therefore, we do not provide + // a fallback path and rather fail loudly if we cannot run FBGEMM. + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); + } + + static at::Tensor meta( + at::Tensor /* input */, + const at::Tensor& weight, + const at::Tensor& bias) { + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); + } +#endif // USE_FBGEMM +}; + +at::Tensor wrapped_fbgemm_pack_gemm_matrix_fp16(const at::Tensor& weight) { +#ifdef USE_FBGEMM + TORCH_CHECK( + weight.dim() == 2, + "fbgemm weight packing only packs matrices not vectors."); + return at::native::fbgemm_pack_gemm_matrix_fp16(weight); +#else // USE_FBGEMM + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); +#endif // USE_FBGEMM +} + +at::Tensor wrapped_fbgemm_pack_gemm_matrix_fp16_meta(const at::Tensor& weight) { +#ifdef USE_FBGEMM + // Strictly speaking this is not correct. However we do not know the exact + // size of the packed matrix as it's being maintained by the object itself, + // therefore we return the view we have here. + return at::empty({8}, weight.options().dtype(at::kByte)); +#else // USE_FBGEMM + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); +#endif // USE_FBGEMM +} + +at::Tensor wrapped_fbgemm_linear_fp16_weight(const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias, int64_t out_channel) { +#ifdef USE_FBGEMM + return at::native::fbgemm_linear_fp16_weight(input, weight, bias); +#else // USE_FBGEMM + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); +#endif // USE_FBGEMM +} + +at::Tensor wrapped_fbgemm_linear_fp16_weight_meta(const at::Tensor& input, const at::Tensor& weight, const at::Tensor& bias, int64_t out_channel) { +#ifdef USE_FBGEMM + // For the meta function, we need users to provide the dimension explicitly + // as we don't have access to the weight. 
+ auto out_sizes = input.sym_sizes().vec(); + if (out_channel == -1) { + out_sizes.pop_back(); + } else { + out_sizes.back() = out_channel; + } + return at::empty_symint(out_sizes, input.options()); +#else // USE_FBGEMM + TORCH_CHECK( + false, "This PyTorch installation was not built with FBGEMM operators"); +#endif // USE_FBGEMM +} + + TORCH_LIBRARY_IMPL(quantized, CPU, m) { register_linear_params(); m.impl( @@ -670,16 +790,40 @@ TORCH_LIBRARY_IMPL(quantized, CPU, m) { m.impl( TORCH_SELECTIVE_NAME("quantized::linear_dynamic_fp16"), TORCH_FN(QLinearDynamicFp16::run)); + m.impl( + TORCH_SELECTIVE_NAME("quantized::linear_dynamic_fp16_unpacked_weight"), + TORCH_FN(QLinearUnpackedDynamicFp16::run)); m.impl( TORCH_SELECTIVE_NAME("quantized::linear_relu_dynamic_fp16"), TORCH_FN(QLinearDynamicFp16::run)); } +TORCH_LIBRARY_IMPL(quantized, Meta, m) { + m.impl( + TORCH_SELECTIVE_NAME("quantized::linear_dynamic_fp16_unpacked_weight"), + TORCH_FN(QLinearUnpackedDynamicFp16::meta)); +} + TORCH_LIBRARY_IMPL(_quantized, CPU, m) { register_linear_params(); m.impl( TORCH_SELECTIVE_NAME("_quantized::linear_dynamic"), TORCH_FN(QLinearDynamicInt8::run)); + m.impl( + TORCH_SELECTIVE_NAME("_quantized::wrapped_fbgemm_pack_gemm_matrix_fp16"), + wrapped_fbgemm_pack_gemm_matrix_fp16); + m.impl( + TORCH_SELECTIVE_NAME("_quantized::wrapped_fbgemm_linear_fp16_weight"), + wrapped_fbgemm_linear_fp16_weight); +} + +TORCH_LIBRARY_IMPL(_quantized, Meta, m) { + m.impl( + TORCH_SELECTIVE_NAME("_quantized::wrapped_fbgemm_pack_gemm_matrix_fp16"), + wrapped_fbgemm_pack_gemm_matrix_fp16_meta); + m.impl( + TORCH_SELECTIVE_NAME("_quantized::wrapped_fbgemm_linear_fp16_weight"), + wrapped_fbgemm_linear_fp16_weight_meta); } } // namespace diff --git a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp index 50cca56a0284d..a2fb34f90b289 100644 --- a/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp +++ b/aten/src/ATen/native/quantized/cpu/qlinear_prepack.cpp @@ -157,9 +157,7 @@ c10::intrusive_ptr PackedLinearWeightsQnnp::prepack( " instead"); at::Tensor weight_contig = weight.contiguous(); - std::vector w_zero_points; - at::Tensor w_scales; - std::tie(w_zero_points, w_scales) = + auto [w_zero_points, w_scales] = make_zero_points_and_scales_tensor(weight_contig); at::native::initQNNPACK(); diff --git a/aten/src/ATen/native/quantized/cpu/qmul.cpp b/aten/src/ATen/native/quantized/cpu/qmul.cpp index d7207ccdf5463..fe997c7a42b6c 100644 --- a/aten/src/ATen/native/quantized/cpu/qmul.cpp +++ b/aten/src/ATen/native/quantized/cpu/qmul.cpp @@ -100,8 +100,8 @@ Tensor _mul_out_xnnpack( if(ReLUFused) { /* - * FIXME: use acticationLimits() - * With , MSVC runs into "error C3862: indetifier activationLimits not + * FIXME: use activationLimits() + * With , MSVC runs into "error C3862: identifier activationLimits not * found". 
*/ constexpr int64_t qmin = std::numeric_limits::min(); diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl b/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl index ea83f48da0f70..0f822afd6da3c 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/buckbuild.bzl @@ -336,6 +336,7 @@ def define_qnnpack(third_party, labels = []): ":ukernels_sse2", ":ukernels_sse41", ":ukernels_ssse3", + third_party("clog"), third_party("cpuinfo"), third_party("FP16"), third_party("FXdiv"), diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/conv-run.cc b/aten/src/ATen/native/quantized/cpu/qnnpack/src/conv-run.cc index c6b7d8cb42049..863ab37e2b2c5 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/conv-run.cc +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/conv-run.cc @@ -400,7 +400,7 @@ enum pytorch_qnnp_status qnnpackConv( threadpool); if (status != pytorch_qnnp_status_success) { pytorch_qnnp_log_error( - "failed to run covolution op setup to setup indirection buffer."); + "failed to run convolution op setup to setup indirection buffer."); return status; } } diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/deconv-run.cc b/aten/src/ATen/native/quantized/cpu/qnnpack/src/deconv-run.cc index 87e3fd2000775..1660bd61e205c 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/deconv-run.cc +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/deconv-run.cc @@ -168,7 +168,7 @@ enum pytorch_qnnp_status qnnpackDeConv( threadpool); if (status != pytorch_qnnp_status_success) { pytorch_qnnp_log_error( - "failed to run decovolution op setup to setup indirection buffer."); + "failed to run deconvolution op setup to setup indirection buffer."); return status; } } diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/fully-connected-sparse.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/fully-connected-sparse.c index 71226ab5250e1..857d78e57625c 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/fully-connected-sparse.c +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/fully-connected-sparse.c @@ -72,7 +72,7 @@ enum pytorch_qnnp_status pytorch_qnnp_create_fully_connected_sparse_dq_nc_q8( if (kernel_row_block_size == 8 && kernel_col_block_size == 1) { // This is to gate 8x1 on SSE2 since we have not implemented SSE2 - // kernel that suppors 8x1 sparsity pattern. + // kernel that supports 8x1 sparsity pattern. if (pytorch_qnnp_params.q8gemm_sparse_c8x1.packA == NULL) { status = pytorch_qnnp_status_invalid_parameter; goto error; diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/indirection.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/indirection.c index 86432e6c1b242..232c015338bbe 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/indirection.c +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/indirection.c @@ -208,7 +208,7 @@ void pytorch_qnnp_indirection_init_conv3d( * width * * step_height: The number of pointers to traverse to move from an output - * pixel's first input's index in the indirection bufffer to that of the + * pixel's first input's index in the indirection buffer to that of the * output pixel one ROW (one output y) after it. * i.e. 
if indirection_buffer[j] points to the first input pixel used to * compute the i'th output pixel, then diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/init.c b/aten/src/ATen/native/quantized/cpu/qnnpack/src/init.c index b2ea18c669c67..b3f1cf40fcc33 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/init.c +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/init.c @@ -81,7 +81,7 @@ static void init(void) { .packA = pytorch_q8gemm_sparse_packA_ukernel_4x4__aarch32_neon, .mr = 4, .nr = 8, - .kr = 4, // kr is really 1 but we set it to 4 because we resuse 4x4 prepacking kernel + .kr = 4, // kr is really 1 but we set it to 4 because we reuse 4x4 prepacking kernel .log2_mr = 2, .log2_row_block_size = 3, .row_block_size = 8, @@ -193,7 +193,7 @@ static void init(void) { .packA = pytorch_q8gemm_sparse_packA_ukernel_8x4__aarch64_neon, .mr = 8, .nr = 8, - .kr = 4, // kr is really 1 but we set it to 4 because we resuse 4x4 prepacking kernel + .kr = 4, // kr is really 1 but we set it to 4 because we reuse 4x4 prepacking kernel .log2_mr = 3, .log2_row_block_size = 3, .row_block_size = 8, diff --git a/aten/src/ATen/native/quantized/cpu/qnnpack/src/qnnpack/pack.h b/aten/src/ATen/native/quantized/cpu/qnnpack/src/qnnpack/pack.h index 67684d7fa40c7..14ea256124856 100644 --- a/aten/src/ATen/native/quantized/cpu/qnnpack/src/qnnpack/pack.h +++ b/aten/src/ATen/native/quantized/cpu/qnnpack/src/qnnpack/pack.h @@ -116,7 +116,7 @@ static inline void pytorch_pack_q8gemm_wrq( } if (kzp != 0) { // This part fills the packed weights with zero points for output channels - // when they are not divisble by nr blocking parameter. + // when they are not divisible by nr blocking parameter. // This is needed because in some kernels, sse2 ones, it relies on this // to produce zero as a result of subtracting zero point from weight value. size_t remaining_nr_blocks = ((nr - nr_block_size) & (np - 1)); @@ -239,7 +239,7 @@ static inline void pytorch_pack_q8conv_wrq( } if (kzp != 0) { // This part fills the packed wights with zero points for output channels - // when they are not divisble by nr blocking parameter. + // when they are not divisible by nr blocking parameter. // In that case for (size_t nr_block_offset = 0; nr_block_offset < (nr - nr_block_size); nr_block_offset++) { @@ -361,7 +361,7 @@ static inline void pytorch_pack_q8deconv_wrq( } if (kzp != 0) { // This part fills the packed wights with zero points for output channels - // when they are not divisble by nr blocking parameter. + // when they are not divisible by nr blocking parameter. 
// In that case for (size_t nr_block_offset = 0; nr_block_offset < (nr - nr_block_size); nr_block_offset++) { diff --git a/aten/src/ATen/native/quantized/cpu/qsigmoid.cpp b/aten/src/ATen/native/quantized/cpu/qsigmoid.cpp index 92703f322c29b..159da6e72febe 100644 --- a/aten/src/ATen/native/quantized/cpu/qsigmoid.cpp +++ b/aten/src/ATen/native/quantized/cpu/qsigmoid.cpp @@ -108,7 +108,7 @@ Tensor sigmoid_quantized_cpu(const Tensor& qx) { #endif // USE_PYTORCH_QNNPACK Tensor qy; AT_DISPATCH_QINT_TYPES(qx.scalar_type(), "qsigmoid", [&]() { - // Naive implemenentation: uses dequantize/execute/quantize routine + // Naive implementation: uses dequantize/execute/quantize routine // - Output scale is set to 1.0 / 2^(BIT_NUM) // - For signed types output zero point is set to 0 // - For unsigned types output zero point is set to (qmax + qmin) / 2.0 diff --git a/aten/src/ATen/native/quantized/cuda/EmbeddingBag.cu b/aten/src/ATen/native/quantized/cuda/EmbeddingBag.cu index 0580c47b8c627..3574bfe28f505 100644 --- a/aten/src/ATen/native/quantized/cuda/EmbeddingBag.cu +++ b/aten/src/ATen/native/quantized/cuda/EmbeddingBag.cu @@ -545,7 +545,7 @@ Tensor qembeddingbag_4bit_unpack(const Tensor& packed_weight) { int BIT_RATE = 4; const auto input_rows = packed_weight.size(0); const auto input_columns = packed_weight.size(1); - const auto* input_data = packed_weight.data_ptr(); + const auto* input_data = packed_weight.const_data_ptr(); int NUM_ELEM_PER_BYTE = 8 / BIT_RATE; // The last 4 bytes per row are two fp16 scale and zero_point. diff --git a/aten/src/ATen/native/quantized/cudnn/Conv.cpp b/aten/src/ATen/native/quantized/cudnn/Conv.cpp index 4cb496640746f..bb97a69859cb4 100644 --- a/aten/src/ATen/native/quantized/cudnn/Conv.cpp +++ b/aten/src/ATen/native/quantized/cudnn/Conv.cpp @@ -75,7 +75,7 @@ void PackedConvWeightCudnn::apply_impl_helper(const at::Tensor& qua if (bias_.has_value()) { // the input bias is a 1-D tensor whose size is the same as the size of the second dimension of quantized_output. // we need to add trailing dimensions in order to properly broadcast bias, otherwise broadcast_to will fail. - // the number of trailling dimensions is quantized_output.dim() - 2, so the new size of the broadcast_bias + // the number of trailing dimensions is quantized_output.dim() - 2, so the new size of the broadcast_bias // becomes quantized_output.dim() - 2 + 1. nothing needs to be done for the leading dimensions std::vector new_size(quantized_output.dim() - 1, 1); new_size[0] = bias_.value().size(0); @@ -157,7 +157,7 @@ void PackedConvWeightCudnn::apply_impl_helper(const at::Tensor& qua c10::optional bias_mult_op; c10::optional sum_conv_bias_op; if (bias_.has_value()) { - // we can't directly assign bias_mult_op becauase operator= is deleted for cudnn_frontend::Operation; + // we can't directly assign bias_mult_op because operator= is deleted for cudnn_frontend::Operation; // alternatively, I think we can use std::unique_ptr and dynamically allocate these builder ops // but here, we chose to do it statically. 
c10::optional::emplace() enables this approach @@ -402,7 +402,7 @@ TORCH_LIBRARY_IMPL(quantized, QuantizedCUDA, m) { m.impl(TORCH_SELECTIVE_NAME("quantized::conv2d_relu.new"), QConvInt8<2, true>::run); } -} // anonyous namespace +} // anonymous namespace } // namespace at::native diff --git a/aten/src/ATen/native/quantized/cudnn/Linear.cpp b/aten/src/ATen/native/quantized/cudnn/Linear.cpp index 37e08ba7861d0..f9333d6fbed7a 100644 --- a/aten/src/ATen/native/quantized/cudnn/Linear.cpp +++ b/aten/src/ATen/native/quantized/cudnn/Linear.cpp @@ -103,7 +103,7 @@ void PackedLinearWeightCudnn::apply_impl_helper(const at::Tensor& quantized_outp if (bias_.has_value()) { // the input bias is a 1-D tensor whose size is the same as the size of the last dimension of quantized_output // we need to add trailing dimensions in order to properly broadcast bias, otherwise broadcast_to will fail. - // the number of trailling dimensions is quantized_output.dim() - 2. We also prepend a leading dimension for clarity + // the number of trailing dimensions is quantized_output.dim() - 2. We also prepend a leading dimension for clarity std::vector new_size(quantized_output.dim(), 1); new_size.back() = bias_.value().size(0); broadcasted_bias = bias_.value().clone().reshape(new_size); @@ -186,7 +186,7 @@ void PackedLinearWeightCudnn::apply_impl_helper(const at::Tensor& quantized_outp c10::optional bias_mult_op; c10::optional sum_linear_bias_op; if (bias_.has_value()) { - // we can't directly assign bias_mult_op becauase operator= is deleted for cudnn_frontend::Operation; + // we can't directly assign bias_mult_op because operator= is deleted for cudnn_frontend::Operation; // alternatively, I think we can use std::unique_ptr and dynamically allocate these builder ops // but here, we chose to do it statically. 
c10::optional::emplace() enables this approach diff --git a/aten/src/ATen/native/quantized/library.cpp b/aten/src/ATen/native/quantized/library.cpp index 41cacf20114de..97de2cfbf078a 100644 --- a/aten/src/ATen/native/quantized/library.cpp +++ b/aten/src/ATen/native/quantized/library.cpp @@ -96,7 +96,7 @@ TORCH_LIBRARY(quantized, m) { m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_dilation(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int[]"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_groups(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv3d_transpose(__torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weights) -> int"), {at::Tag::pt2_compliant_tag}); - // conv_tranpsose + // conv_transpose m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose1d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose2d(Tensor qx, __torch__.torch.classes.quantized.Conv2dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::conv_transpose3d(Tensor qx, __torch__.torch.classes.quantized.Conv3dPackedParamsBase packed_weight, float output_scale, int output_zero_point) -> Tensor"), {at::Tag::pt2_compliant_tag}); @@ -149,6 +149,7 @@ TORCH_LIBRARY(quantized, m) { m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_relu_dynamic(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, bool reduce_range=False) -> Tensor Y"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic_fp16(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y"), {at::Tag::pt2_compliant_tag}); + m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_dynamic_fp16_unpacked_weight(Tensor X, Tensor weight, Tensor bias) -> Tensor Y"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_relu_dynamic_fp16(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack) -> Tensor Y"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_leaky_relu(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i, float negative_slope) -> Tensor Y"), {at::Tag::pt2_compliant_tag}); m.def(TORCH_SELECTIVE_SCHEMA("quantized::linear_tanh(Tensor X, __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack, float Y_scale_i, int Y_zero_point_i) -> Tensor Y"), {at::Tag::pt2_compliant_tag}); @@ -246,6 +247,8 @@ TORCH_LIBRARY(_quantized, m) { m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear_prepack_fp16(Tensor W, Tensor? B=None) -> __torch__.torch.classes.quantized.LinearPackedParamsBase W_prepack")); m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear_prepack_legacy(Tensor W, Tensor? B=None) -> Tensor W_prepack")); m.def(TORCH_SELECTIVE_SCHEMA("_quantized::linear_prepack_fp16_legacy(Tensor W, Tensor? 
B=None) -> Tensor W_prepack")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_fbgemm_pack_gemm_matrix_fp16(Tensor W) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("_quantized::wrapped_fbgemm_linear_fp16_weight(Tensor X, Tensor W, Tensor B, int out_channel) -> Tensor")); } TORCH_LIBRARY(onednn, m) { @@ -254,16 +257,20 @@ TORCH_LIBRARY(onednn, m) { m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv_prepack(Tensor weight, Tensor w_scales, float x_scale, int x_zp, int[] stride, int[] padding, int[] dilation, int groups, int[]? x_shape=None) -> Tensor")); // Conv1D/2D/3D with unary postop - m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv1d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor")); - m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor")); - m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv3d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv1d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv3d_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? output_dtype, str attr, Scalar?[] scalars, str? algorithm) -> Tensor")); // Conv2D with binary postop - m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise.binary(Tensor qx, float x_scale, int x_zero_point, Tensor qaccum, float accum_scale, int accum_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float inv_output_scale, int output_zero_point, ScalarType? output_dtype, str binary_attr, Scalar? alpha, str? unary_attr, Scalar?[] unary_scalars, str? unary_algorithm) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("onednn::qconv2d_pointwise.binary(Tensor qx, float x_scale, int x_zero_point, Tensor qaccum, float accum_scale, int accum_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, int[] stride, int[] padding, int[] dilation, int groups, float output_scale, int output_zero_point, ScalarType? 
output_dtype, str binary_attr, Scalar? alpha, str? unary_attr, Scalar?[] unary_scalars, str? unary_algorithm) -> Tensor")); // Linear prepack m.def(TORCH_SELECTIVE_SCHEMA("onednn::qlinear_prepack(Tensor weight, int[]? x_shape) -> Tensor")); // Linear with unary postop m.def(TORCH_SELECTIVE_SCHEMA("onednn::qlinear_pointwise(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, float output_scale, int output_zero_point, ScalarType? output_dtype, str post_op_name, Scalar?[] post_op_args, str post_op_algorithm) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("onednn::qlinear_pointwise.tensor(Tensor qx, Tensor x_scale, Tensor x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, float output_scale, int output_zero_point, ScalarType? output_dtype, str post_op_name, Scalar?[] post_op_args, str post_op_algorithm) -> Tensor")); + // Linear with binary postop + m.def(TORCH_SELECTIVE_SCHEMA("onednn::qlinear_pointwise.binary(Tensor qx, float x_scale, int x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, float output_scale, int output_zero_point, ScalarType? output_dtype, Tensor? other, float other_scale, int other_zp, str binary_post_op, float binary_alpha, str unary_post_op, Scalar?[] unary_post_op_args, str unary_post_op_algorithm) -> Tensor")); + m.def(TORCH_SELECTIVE_SCHEMA("onednn::qlinear_pointwise.binary_tensor(Tensor qx, Tensor x_scale, Tensor x_zero_point, Tensor qw, Tensor w_scale, Tensor w_zero_point, Tensor? bias, float output_scale, int output_zero_point, ScalarType? output_dtype, Tensor? other, float other_scale, int other_zp, str binary_post_op, float binary_alpha, str unary_post_op, Scalar?[] unary_post_op_args, str unary_post_op_algorithm) -> Tensor")); } diff --git a/aten/src/ATen/native/quantized/qconv_unpack.cpp b/aten/src/ATen/native/quantized/qconv_unpack.cpp index cff99560b7eec..fe4007c712ce5 100644 --- a/aten/src/ATen/native/quantized/qconv_unpack.cpp +++ b/aten/src/ATen/native/quantized/qconv_unpack.cpp @@ -181,9 +181,7 @@ class QConvTranspose final { IValue unpack_quantized_prepacked_sizes_conv2d(const IValue& ivalue) { auto params = ivalue.toCustomClass>(); - at::Tensor weight; - c10::optional bias; - std::tie(weight, bias) = params->unpack(); + auto [weight, bias] = params->unpack(); at::OptionalIntArrayRef bias_sizes = c10::nullopt; if (bias && bias->defined()) { bias_sizes = bias->sizes(); diff --git a/aten/src/ATen/native/sparse/FlattenIndicesCommon.h b/aten/src/ATen/native/sparse/FlattenIndicesCommon.h index 231da0e911c34..0e79ed809ae6d 100644 --- a/aten/src/ATen/native/sparse/FlattenIndicesCommon.h +++ b/aten/src/ATen/native/sparse/FlattenIndicesCommon.h @@ -62,7 +62,7 @@ Tensor _flatten_indices_impl(const Tensor& indices, IntArrayRef size) { .build(); { - const auto* RESTRICT ptr_indices = indices.data_ptr(); + const auto* RESTRICT ptr_indices = indices.const_data_ptr(); KernelLauncher::launch(iter, // NOTE: capture by value required by CUDA @@ -87,7 +87,7 @@ Tensor _flatten_indices_impl(const Tensor& indices, IntArrayRef size) { template