diff --git a/.circleci/cimodel/data/binary_build_data.py b/.circleci/cimodel/data/binary_build_data.py index 21b6eebef5a1..7c81ad2f78ae 100644 --- a/.circleci/cimodel/data/binary_build_data.py +++ b/.circleci/cimodel/data/binary_build_data.py @@ -36,6 +36,7 @@ def get_processor_arch_name(gpu_version): "3.6m", "3.7m", "3.8m", + "3.9m" ], conda=dimensions.STANDARD_PYTHON_VERSIONS, libtorch=[ @@ -43,11 +44,18 @@ def get_processor_arch_name(gpu_version): ], ) +# TODO: There's an issue with current Python 3.9 builds that only occurs during +# windows builds, let's just not build 3.9 for windows and figure out how +# to resolve afterwards +PYTHON_VERSIONS_NO_39 = [ + v for v in dimensions.STANDARD_PYTHON_VERSIONS if v not in ['3.9'] +] + CONFIG_TREE_DATA = OrderedDict( linux=(dimensions.GPU_VERSIONS, LINUX_PACKAGE_VARIANTS), macos=([None], OrderedDict( - wheel=dimensions.STANDARD_PYTHON_VERSIONS, - conda=dimensions.STANDARD_PYTHON_VERSIONS, + wheel=PYTHON_VERSIONS_NO_39, + conda=PYTHON_VERSIONS_NO_39, libtorch=[ "3.7", ], @@ -56,8 +64,8 @@ def get_processor_arch_name(gpu_version): windows=( [v for v in dimensions.GPU_VERSIONS if v not in ['cuda92'] + dimensions.ROCM_VERSION_LABELS], OrderedDict( - wheel=dimensions.STANDARD_PYTHON_VERSIONS, - conda=dimensions.STANDARD_PYTHON_VERSIONS, + wheel=PYTHON_VERSIONS_NO_39, + conda=PYTHON_VERSIONS_NO_39, libtorch=[ "3.7", ], diff --git a/.circleci/cimodel/data/dimensions.py b/.circleci/cimodel/data/dimensions.py index 270c96498d39..57489ebe7915 100644 --- a/.circleci/cimodel/data/dimensions.py +++ b/.circleci/cimodel/data/dimensions.py @@ -20,4 +20,5 @@ "3.6", "3.7", "3.8", + "3.9" ] diff --git a/.circleci/cimodel/data/simple/docker_definitions.py b/.circleci/cimodel/data/simple/docker_definitions.py index ef1d7e8d1a70..91f757207915 100644 --- a/.circleci/cimodel/data/simple/docker_definitions.py +++ b/.circleci/cimodel/data/simple/docker_definitions.py @@ -31,6 +31,7 @@ "pytorch-linux-xenial-py3.6-gcc7", "pytorch-linux-bionic-rocm3.7-py3.6", "pytorch-linux-bionic-rocm3.8-py3.6", + "pytorch-linux-bionic-rocm3.9-py3.6", ] diff --git a/.circleci/config.yml b/.circleci/config.yml index 1d4f7de1faa4..b5144dc703ea 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -1985,6 +1985,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-cuda102" + - binary_linux_build: + name: binary_linux_manywheel_3_9m_cpu_devtoolset7_nightly_build + build_environment: "manywheel 3.9m cpu devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-cuda102" - binary_linux_build: name: binary_linux_manywheel_3_6m_cu92_devtoolset7_nightly_build build_environment: "manywheel 3.6m cu92 devtoolset7" @@ -2018,6 +2029,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-cuda92" + - binary_linux_build: + name: binary_linux_manywheel_3_9m_cu92_devtoolset7_nightly_build + build_environment: "manywheel 3.9m cu92 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-cuda92" - binary_linux_build: name: binary_linux_manywheel_3_6m_cu101_devtoolset7_nightly_build build_environment: "manywheel 3.6m cu101 devtoolset7" @@ -2051,6 +2073,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-cuda101" + - binary_linux_build: + name: binary_linux_manywheel_3_9m_cu101_devtoolset7_nightly_build + build_environment: "manywheel 3.9m 
cu101 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-cuda101" - binary_linux_build: name: binary_linux_manywheel_3_6m_cu102_devtoolset7_nightly_build build_environment: "manywheel 3.6m cu102 devtoolset7" @@ -2084,6 +2117,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-cuda102" + - binary_linux_build: + name: binary_linux_manywheel_3_9m_cu102_devtoolset7_nightly_build + build_environment: "manywheel 3.9m cu102 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-cuda102" - binary_linux_build: name: binary_linux_manywheel_3_6m_cu110_devtoolset7_nightly_build build_environment: "manywheel 3.6m cu110 devtoolset7" @@ -2117,6 +2161,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-cuda110" + - binary_linux_build: + name: binary_linux_manywheel_3_9m_cu110_devtoolset7_nightly_build + build_environment: "manywheel 3.9m cu110 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-cuda110" - binary_linux_build: name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_build build_environment: "manywheel 3.6m rocm3.8 devtoolset7" @@ -2150,6 +2205,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-rocm:3.8" + - binary_linux_build: + name: binary_linux_manywheel_3_9m_rocm3_8_devtoolset7_nightly_build + build_environment: "manywheel 3.9m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-rocm:3.8" - binary_linux_build: name: binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_build build_environment: "manywheel 3.6m rocm3.9 devtoolset7" @@ -2183,6 +2249,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/manylinux-rocm:3.9" + - binary_linux_build: + name: binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_build + build_environment: "manywheel 3.9m rocm3.9 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/manylinux-rocm:3.9" - binary_linux_build: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_build build_environment: "conda 3.6 cpu devtoolset7" @@ -2216,6 +2293,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/conda-cuda" + - binary_linux_build: + name: binary_linux_conda_3_9_cpu_devtoolset7_nightly_build + build_environment: "conda 3.9 cpu devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/conda-cuda" - binary_linux_build: name: binary_linux_conda_3_6_cu92_devtoolset7_nightly_build build_environment: "conda 3.6 cu92 devtoolset7" @@ -2249,6 +2337,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/conda-cuda" + - binary_linux_build: + name: binary_linux_conda_3_9_cu92_devtoolset7_nightly_build + build_environment: "conda 3.9 cu92 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/conda-cuda" - binary_linux_build: name: binary_linux_conda_3_6_cu101_devtoolset7_nightly_build build_environment: "conda 3.6 cu101 devtoolset7" @@ -2282,6 +2381,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: 
"pytorch/conda-cuda" + - binary_linux_build: + name: binary_linux_conda_3_9_cu101_devtoolset7_nightly_build + build_environment: "conda 3.9 cu101 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/conda-cuda" - binary_linux_build: name: binary_linux_conda_3_6_cu102_devtoolset7_nightly_build build_environment: "conda 3.6 cu102 devtoolset7" @@ -2315,6 +2425,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/conda-cuda" + - binary_linux_build: + name: binary_linux_conda_3_9_cu102_devtoolset7_nightly_build + build_environment: "conda 3.9 cu102 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/conda-cuda" - binary_linux_build: name: binary_linux_conda_3_6_cu110_devtoolset7_nightly_build build_environment: "conda 3.6 cu110 devtoolset7" @@ -2348,6 +2469,17 @@ workflows: only: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ docker_image: "pytorch/conda-cuda" + - binary_linux_build: + name: binary_linux_conda_3_9_cu110_devtoolset7_nightly_build + build_environment: "conda 3.9 cu110 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + docker_image: "pytorch/conda-cuda" - binary_linux_build: name: binary_linux_libtorch_3_7m_cpu_devtoolset7_nightly_shared-with-deps_build build_environment: "libtorch 3.7m cpu devtoolset7" @@ -3257,6 +3389,19 @@ workflows: requires: - binary_linux_manywheel_3_8m_cpu_devtoolset7_nightly_build docker_image: "pytorch/manylinux-cuda102" + - binary_linux_test: + name: binary_linux_manywheel_3_9m_cpu_devtoolset7_nightly_test + build_environment: "manywheel 3.9m cpu devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_9m_cpu_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-cuda102" - binary_linux_test: name: binary_linux_manywheel_3_6m_cu92_devtoolset7_nightly_test build_environment: "manywheel 3.6m cu92 devtoolset7" @@ -3302,6 +3447,21 @@ workflows: docker_image: "pytorch/manylinux-cuda92" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_9m_cu92_devtoolset7_nightly_test + build_environment: "manywheel 3.9m cu92 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_9m_cu92_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-cuda92" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_manywheel_3_6m_cu101_devtoolset7_nightly_test build_environment: "manywheel 3.6m cu101 devtoolset7" @@ -3347,6 +3507,21 @@ workflows: docker_image: "pytorch/manylinux-cuda101" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_9m_cu101_devtoolset7_nightly_test + build_environment: "manywheel 3.9m cu101 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_9m_cu101_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-cuda101" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_manywheel_3_6m_cu102_devtoolset7_nightly_test build_environment: "manywheel 3.6m cu102 devtoolset7" @@ -3392,6 +3567,21 @@ workflows: docker_image: "pytorch/manylinux-cuda102" use_cuda_docker_runtime: 
"1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_9m_cu102_devtoolset7_nightly_test + build_environment: "manywheel 3.9m cu102 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_9m_cu102_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-cuda102" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_manywheel_3_6m_cu110_devtoolset7_nightly_test build_environment: "manywheel 3.6m cu110 devtoolset7" @@ -3437,6 +3627,21 @@ workflows: docker_image: "pytorch/manylinux-cuda110" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_9m_cu110_devtoolset7_nightly_test + build_environment: "manywheel 3.9m cu110 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_9m_cu110_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-cuda110" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_test build_environment: "manywheel 3.6m rocm3.8 devtoolset7" @@ -3482,6 +3687,21 @@ workflows: docker_image: "pytorch/manylinux-rocm:3.8" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_9m_rocm3_8_devtoolset7_nightly_test + build_environment: "manywheel 3.9m rocm3.8 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_9m_rocm3_8_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_test build_environment: "manywheel 3.6m rocm3.9 devtoolset7" @@ -3527,6 +3747,21 @@ workflows: docker_image: "pytorch/manylinux-rocm:3.9" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_test + build_environment: "manywheel 3.9m rocm3.9 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_build + docker_image: "pytorch/manylinux-rocm:3.9" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_test build_environment: "conda 3.6 cpu devtoolset7" @@ -3566,6 +3801,19 @@ workflows: requires: - binary_linux_conda_3_8_cpu_devtoolset7_nightly_build docker_image: "pytorch/conda-cuda" + - binary_linux_test: + name: binary_linux_conda_3_9_cpu_devtoolset7_nightly_test + build_environment: "conda 3.9 cpu devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_conda_3_9_cpu_devtoolset7_nightly_build + docker_image: "pytorch/conda-cuda" - binary_linux_test: name: binary_linux_conda_3_6_cu92_devtoolset7_nightly_test build_environment: "conda 3.6 cu92 devtoolset7" @@ -3611,6 +3859,21 @@ workflows: docker_image: "pytorch/conda-cuda" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_conda_3_9_cu92_devtoolset7_nightly_test + build_environment: "conda 3.9 cu92 devtoolset7" + filters: + branches: + only: + - /.*/ + 
tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_conda_3_9_cu92_devtoolset7_nightly_build + docker_image: "pytorch/conda-cuda" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_conda_3_6_cu101_devtoolset7_nightly_test build_environment: "conda 3.6 cu101 devtoolset7" @@ -3656,6 +3919,21 @@ workflows: docker_image: "pytorch/conda-cuda" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_conda_3_9_cu101_devtoolset7_nightly_test + build_environment: "conda 3.9 cu101 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_conda_3_9_cu101_devtoolset7_nightly_build + docker_image: "pytorch/conda-cuda" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_conda_3_6_cu102_devtoolset7_nightly_test build_environment: "conda 3.6 cu102 devtoolset7" @@ -3701,6 +3979,21 @@ workflows: docker_image: "pytorch/conda-cuda" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_conda_3_9_cu102_devtoolset7_nightly_test + build_environment: "conda 3.9 cu102 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_conda_3_9_cu102_devtoolset7_nightly_build + docker_image: "pytorch/conda-cuda" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_conda_3_6_cu110_devtoolset7_nightly_test build_environment: "conda 3.6 cu110 devtoolset7" @@ -3746,6 +4039,21 @@ workflows: docker_image: "pytorch/conda-cuda" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - binary_linux_test: + name: binary_linux_conda_3_9_cu110_devtoolset7_nightly_test + build_environment: "conda 3.9 cu110 devtoolset7" + filters: + branches: + only: + - /.*/ + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + requires: + - binary_linux_conda_3_9_cu110_devtoolset7_nightly_build + docker_image: "pytorch/conda-cuda" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - binary_linux_test: name: binary_linux_libtorch_3_7m_cpu_devtoolset7_nightly_shared-with-deps_test build_environment: "libtorch 3.7m cpu devtoolset7" @@ -4820,6 +5128,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel upload_subfolder: cpu + - binary_upload: + name: binary_linux_manywheel_3_9m_cpu_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_9m_cpu_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: cpu - binary_upload: name: binary_linux_manywheel_3_6m_cu92_devtoolset7_nightly_upload context: org-member @@ -4862,6 +5184,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel upload_subfolder: cu92 + - binary_upload: + name: binary_linux_manywheel_3_9m_cu92_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_9m_cu92_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: cu92 - binary_upload: name: binary_linux_manywheel_3_6m_cu101_devtoolset7_nightly_upload context: org-member @@ -4904,6 +5240,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel upload_subfolder: cu101 + - binary_upload: + name: 
binary_linux_manywheel_3_9m_cu101_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_9m_cu101_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: cu101 - binary_upload: name: binary_linux_manywheel_3_6m_cu102_devtoolset7_nightly_upload context: org-member @@ -4946,6 +5296,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel upload_subfolder: cu102 + - binary_upload: + name: binary_linux_manywheel_3_9m_cu102_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_9m_cu102_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: cu102 - binary_upload: name: binary_linux_manywheel_3_6m_cu110_devtoolset7_nightly_upload context: org-member @@ -4988,6 +5352,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel upload_subfolder: cu110 + - binary_upload: + name: binary_linux_manywheel_3_9m_cu110_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_9m_cu110_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: cu110 - binary_upload: name: binary_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly_upload context: org-member @@ -5030,6 +5408,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel upload_subfolder: rocm3.8 + - binary_upload: + name: binary_linux_manywheel_3_9m_rocm3_8_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_9m_rocm3_8_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: rocm3.8 - binary_upload: name: binary_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly_upload context: org-member @@ -5072,6 +5464,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: manywheel upload_subfolder: rocm3.9 + - binary_upload: + name: binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: manywheel + upload_subfolder: rocm3.9 - binary_upload: name: binary_linux_conda_3_6_cpu_devtoolset7_nightly_upload context: org-member @@ -5114,6 +5520,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: conda upload_subfolder: cpu + - binary_upload: + name: binary_linux_conda_3_9_cpu_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_conda_3_9_cpu_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: conda + upload_subfolder: cpu - binary_upload: name: binary_linux_conda_3_6_cu92_devtoolset7_nightly_upload context: org-member @@ -5156,6 +5576,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: conda upload_subfolder: cu92 + - binary_upload: + name: binary_linux_conda_3_9_cu92_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_conda_3_9_cu92_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: conda + upload_subfolder: cu92 - 
binary_upload: name: binary_linux_conda_3_6_cu101_devtoolset7_nightly_upload context: org-member @@ -5198,6 +5632,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: conda upload_subfolder: cu101 + - binary_upload: + name: binary_linux_conda_3_9_cu101_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_conda_3_9_cu101_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: conda + upload_subfolder: cu101 - binary_upload: name: binary_linux_conda_3_6_cu102_devtoolset7_nightly_upload context: org-member @@ -5240,6 +5688,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: conda upload_subfolder: cu102 + - binary_upload: + name: binary_linux_conda_3_9_cu102_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_conda_3_9_cu102_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: conda + upload_subfolder: cu102 - binary_upload: name: binary_linux_conda_3_6_cu110_devtoolset7_nightly_upload context: org-member @@ -5282,6 +5744,20 @@ workflows: - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ package_type: conda upload_subfolder: cu110 + - binary_upload: + name: binary_linux_conda_3_9_cu110_devtoolset7_nightly_upload + context: org-member + requires: + - binary_linux_conda_3_9_cu110_devtoolset7_nightly_test + filters: + branches: + only: + - nightly + tags: + only: + - /v[0-9]+(\.[0-9]+)*-rc[0-9]+/ + package_type: conda + upload_subfolder: cu110 - binary_upload: name: binary_linux_libtorch_3_7m_cpu_devtoolset7_nightly_shared-with-deps_upload context: org-member @@ -6471,6 +6947,9 @@ workflows: - docker_build_job: name: "docker-pytorch-linux-bionic-rocm3.8-py3.6" image_name: "pytorch-linux-bionic-rocm3.8-py3.6" + - docker_build_job: + name: "docker-pytorch-linux-bionic-rocm3.9-py3.6" + image_name: "pytorch-linux-bionic-rocm3.9-py3.6" - pytorch_linux_build: name: pytorch_linux_xenial_py3_6_gcc5_4_build requires: @@ -7450,6 +7929,16 @@ workflows: only: - postnightly docker_image: "pytorch/manylinux-cuda102" + - smoke_linux_test: + name: smoke_linux_manywheel_3_9m_cpu_devtoolset7_nightly + build_environment: "manywheel 3.9m cpu devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-cuda102" - smoke_linux_test: name: smoke_linux_manywheel_3_6m_cu92_devtoolset7_nightly build_environment: "manywheel 3.6m cu92 devtoolset7" @@ -7486,6 +7975,18 @@ workflows: docker_image: "pytorch/manylinux-cuda92" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_9m_cu92_devtoolset7_nightly + build_environment: "manywheel 3.9m cu92 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-cuda92" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_manywheel_3_6m_cu101_devtoolset7_nightly build_environment: "manywheel 3.6m cu101 devtoolset7" @@ -7522,6 +8023,18 @@ workflows: docker_image: "pytorch/manylinux-cuda101" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_9m_cu101_devtoolset7_nightly + build_environment: "manywheel 3.9m cu101 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-cuda101" + use_cuda_docker_runtime: "1" + 
resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_manywheel_3_6m_cu102_devtoolset7_nightly build_environment: "manywheel 3.6m cu102 devtoolset7" @@ -7558,6 +8071,18 @@ workflows: docker_image: "pytorch/manylinux-cuda102" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_9m_cu102_devtoolset7_nightly + build_environment: "manywheel 3.9m cu102 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-cuda102" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_manywheel_3_6m_cu110_devtoolset7_nightly build_environment: "manywheel 3.6m cu110 devtoolset7" @@ -7594,6 +8119,18 @@ workflows: docker_image: "pytorch/manylinux-cuda110" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_9m_cu110_devtoolset7_nightly + build_environment: "manywheel 3.9m cu110 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-cuda110" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_manywheel_3_6m_rocm3_8_devtoolset7_nightly build_environment: "manywheel 3.6m rocm3.8 devtoolset7" @@ -7630,6 +8167,18 @@ workflows: docker_image: "pytorch/manylinux-rocm:3.8" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_9m_rocm3_8_devtoolset7_nightly + build_environment: "manywheel 3.9m rocm3.8 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-rocm:3.8" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_manywheel_3_6m_rocm3_9_devtoolset7_nightly build_environment: "manywheel 3.6m rocm3.9 devtoolset7" @@ -7666,6 +8215,18 @@ workflows: docker_image: "pytorch/manylinux-rocm:3.9" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_manywheel_3_9m_rocm3_9_devtoolset7_nightly + build_environment: "manywheel 3.9m rocm3.9 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/manylinux-rocm:3.9" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_conda_3_6_cpu_devtoolset7_nightly build_environment: "conda 3.6 cpu devtoolset7" @@ -7696,6 +8257,16 @@ workflows: only: - postnightly docker_image: "pytorch/conda-cuda" + - smoke_linux_test: + name: smoke_linux_conda_3_9_cpu_devtoolset7_nightly + build_environment: "conda 3.9 cpu devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/conda-cuda" - smoke_linux_test: name: smoke_linux_conda_3_6_cu92_devtoolset7_nightly build_environment: "conda 3.6 cu92 devtoolset7" @@ -7732,6 +8303,18 @@ workflows: docker_image: "pytorch/conda-cuda" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_conda_3_9_cu92_devtoolset7_nightly + build_environment: "conda 3.9 cu92 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/conda-cuda" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_conda_3_6_cu101_devtoolset7_nightly build_environment: "conda 3.6 cu101 devtoolset7" @@ -7768,6 +8351,18 @@ workflows: 
docker_image: "pytorch/conda-cuda" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_conda_3_9_cu101_devtoolset7_nightly + build_environment: "conda 3.9 cu101 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/conda-cuda" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_conda_3_6_cu102_devtoolset7_nightly build_environment: "conda 3.6 cu102 devtoolset7" @@ -7804,6 +8399,18 @@ workflows: docker_image: "pytorch/conda-cuda" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_conda_3_9_cu102_devtoolset7_nightly + build_environment: "conda 3.9 cu102 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/conda-cuda" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_conda_3_6_cu110_devtoolset7_nightly build_environment: "conda 3.6 cu110 devtoolset7" @@ -7840,6 +8447,18 @@ workflows: docker_image: "pytorch/conda-cuda" use_cuda_docker_runtime: "1" resource_class: gpu.medium + - smoke_linux_test: + name: smoke_linux_conda_3_9_cu110_devtoolset7_nightly + build_environment: "conda 3.9 cu110 devtoolset7" + requires: + - update_s3_htmls + filters: + branches: + only: + - postnightly + docker_image: "pytorch/conda-cuda" + use_cuda_docker_runtime: "1" + resource_class: gpu.medium - smoke_linux_test: name: smoke_linux_libtorch_3_7m_cpu_devtoolset7_nightly_shared-with-deps build_environment: "libtorch 3.7m cpu devtoolset7" diff --git a/.circleci/docker/build.sh b/.circleci/docker/build.sh index 7353bc18c221..019c7f6e9d1c 100755 --- a/.circleci/docker/build.sh +++ b/.circleci/docker/build.sh @@ -288,6 +288,13 @@ case "$image" in VISION=yes ROCM_VERSION=3.8 ;; + pytorch-linux-bionic-rocm3.9-py3.6) + ANACONDA_PYTHON_VERSION=3.6 + PROTOBUF=yes + DB=yes + VISION=yes + ROCM_VERSION=3.9 + ;; *) # Catch-all for builds that are not hardcoded. 
PROTOBUF=yes diff --git a/.circleci/docker/common/install_conda.sh b/.circleci/docker/common/install_conda.sh index c63e28029f07..b7ad26f44836 100755 --- a/.circleci/docker/common/install_conda.sh +++ b/.circleci/docker/common/install_conda.sh @@ -98,11 +98,28 @@ if [ -n "$ANACONDA_PYTHON_VERSION" ]; then # Install some other packages, including those needed for Python test reporting # TODO: Why is scipy pinned - # numba & llvmlite is pinned because of https://github.com/numba/numba/issues/4368 - # scikit-learn is pinned because of - # https://github.com/scikit-learn/scikit-learn/issues/14485 (affects gcc 5.5 - # only) - as_jenkins pip install --progress-bar off pytest scipy==1.1.0 scikit-learn==0.20.3 scikit-image librosa>=0.6.2 psutil numba==0.46.0 llvmlite==0.30.0 unittest-xml-reporting coverage + # Pin MyPy version because new errors are likely to appear with each release + # Pin hypothesis to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 + as_jenkins pip install --progress-bar off pytest \ + scipy==1.1.0 \ + scikit-image \ + librosa>=0.6.2 \ + psutil \ + numba \ + llvmlite \ + unittest-xml-reporting \ + coverage \ + hypothesis==4.53.2 \ + mypy==0.770 \ + tb-nightly + + # Update scikit-learn to a python-3.8 compatible version + if [[ $(python -c "import sys; print(int(sys.version_info >= (3, 8)))") == "1" ]]; then + as_jenkins pip install --progress-bar off -U scikit-learn + else + # Pinned scikit-learn due to https://github.com/scikit-learn/scikit-learn/issues/14485 (affects gcc 5.5 only) + as_jenkins pip install --progress-bar off scikit-learn==0.20.3 + fi popd fi diff --git a/.circleci/scripts/binary_linux_test.sh b/.circleci/scripts/binary_linux_test.sh index b0d7385d07ee..6be3a0ddefc7 100755 --- a/.circleci/scripts/binary_linux_test.sh +++ b/.circleci/scripts/binary_linux_test.sh @@ -5,12 +5,13 @@ cat >/home/circleci/project/ci_test_script.sh </dev/null elif [[ "$PACKAGE_TYPE" != libtorch ]]; then - python_nodot="\$(echo $DESIRED_PYTHON | tr -d m.u)" python_path="/opt/python/cp\$python_nodot-cp\${python_nodot}" # Prior to Python 3.8 paths were suffixed with an 'm' if [[ -d "\${python_path}/bin" ]]; then @@ -20,6 +21,11 @@ elif [[ "$PACKAGE_TYPE" != libtorch ]]; then fi fi +EXTRA_CONDA_FLAGS="" +if [[ "\$python_nodot" = *39* ]]; then + EXTRA_CONDA_FLAGS="-c=conda-forge" +fi + # Install the package # These network calls should not have 'retry's because they are installing # locally and aren't actually network calls @@ -28,11 +34,11 @@ fi # conda build scripts themselves. 
These should really be consolidated pkg="/final_pkgs/\$(ls /final_pkgs)" if [[ "$PACKAGE_TYPE" == conda ]]; then - conda install -y "\$pkg" --offline + conda install \${EXTRA_CONDA_FLAGS} -y "\$pkg" --offline if [[ "$DESIRED_CUDA" == 'cpu' ]]; then - retry conda install -y cpuonly -c pytorch + retry conda install \${EXTRA_CONDA_FLAGS} -y cpuonly -c pytorch fi - retry conda install -yq future numpy protobuf six + retry conda install \${EXTRA_CONDA_FLAGS} -yq future numpy protobuf six if [[ "$DESIRED_CUDA" != 'cpu' ]]; then # DESIRED_CUDA is in format cu90 or cu102 if [[ "${#DESIRED_CUDA}" == 4 ]]; then @@ -40,7 +46,7 @@ if [[ "$PACKAGE_TYPE" == conda ]]; then else cu_ver="${DESIRED_CUDA:2:2}.${DESIRED_CUDA:4}" fi - retry conda install -yq -c nvidia -c pytorch "cudatoolkit=\${cu_ver}" + retry conda install \${EXTRA_CONDA_FLAGS} -yq -c nvidia -c pytorch "cudatoolkit=\${cu_ver}" fi elif [[ "$PACKAGE_TYPE" != libtorch ]]; then pip install "\$pkg" diff --git a/.gitignore b/.gitignore index e908b405a662..3d2e85be977f 100644 --- a/.gitignore +++ b/.gitignore @@ -33,6 +33,8 @@ docs/cpp/src docs/src/**/* docs/cpp/build docs/cpp/source/api +docs/cpp/source/html/ +docs/cpp/source/latex/ docs/source/generated/ log test/.coverage diff --git a/.jenkins/caffe2/build.sh b/.jenkins/caffe2/build.sh index bba8aa0e0365..3efaf448e4ab 100755 --- a/.jenkins/caffe2/build.sh +++ b/.jenkins/caffe2/build.sh @@ -161,6 +161,11 @@ if [[ $BUILD_ENVIRONMENT == *cuda* ]]; then export PATH="/usr/local/cuda/bin:$PATH" fi if [[ $BUILD_ENVIRONMENT == *rocm* ]]; then + if [[ -n "$IN_CI" ]]; then + # Set ROCM_ARCH to gfx900 and gfx906 for CI builds + echo "Limiting PYTORCH_ROCM_ARCH to gfx90[06] for CI builds" + export PYTORCH_ROCM_ARCH="gfx900;gfx906" + fi # This is needed to enable ImageInput operator in resnet50_trainer build_args+=("USE_OPENCV=ON") # This is needed to read datasets from https://download.caffe2.ai/databases/resnet_trainer.zip diff --git a/.jenkins/caffe2/test.sh b/.jenkins/caffe2/test.sh index 9656ec338fe7..9fd031f49907 100755 --- a/.jenkins/caffe2/test.sh +++ b/.jenkins/caffe2/test.sh @@ -88,20 +88,16 @@ if [[ "$PIP_USER" = root ]]; then MAYBE_SUDO=sudo fi -# if [[ "$BUILD_ENVIRONMENT" == *ubuntu14.04* ]]; then - # Hotfix, use hypothesis 3.44.6 on Ubuntu 14.04 - # See comments on - # https://github.com/HypothesisWorks/hypothesis-python/commit/eadd62e467d6cee6216e71b391951ec25b4f5830 - $MAYBE_SUDO pip -q uninstall -y hypothesis - $MAYBE_SUDO pip -q uninstall -y coverage - # "pip install hypothesis==3.44.6" from official server is unreliable on - # CircleCI, so we host a copy on S3 instead - $MAYBE_SUDO pip -q install attrs==18.1.0 -f https://s3.amazonaws.com/ossci-linux/wheels/attrs-18.1.0-py2.py3-none-any.whl - $MAYBE_SUDO pip -q install coverage==4.5.1 -f https://s3.amazonaws.com/ossci-linux/wheels/coverage-4.5.1-cp36-cp36m-macosx_10_12_x86_64.whl - $MAYBE_SUDO pip -q install hypothesis==3.44.6 -f https://s3.amazonaws.com/ossci-linux/wheels/hypothesis-3.44.6-py3-none-any.whl -# else -# pip install --user --no-cache-dir hypothesis==3.59.0 -# fi +# Uninstall pre-installed hypothesis and coverage to use an older version as newer +# versions remove the timeout parameter from settings which ideep/conv_transpose_test.py uses +$MAYBE_SUDO pip -q uninstall -y hypothesis +$MAYBE_SUDO pip -q uninstall -y coverage + +# "pip install hypothesis==3.44.6" from official server is unreliable on +# CircleCI, so we host a copy on S3 instead +$MAYBE_SUDO pip -q install attrs==18.1.0 -f 
https://s3.amazonaws.com/ossci-linux/wheels/attrs-18.1.0-py2.py3-none-any.whl +$MAYBE_SUDO pip -q install coverage==4.5.1 -f https://s3.amazonaws.com/ossci-linux/wheels/coverage-4.5.1-cp36-cp36m-macosx_10_12_x86_64.whl +$MAYBE_SUDO pip -q install hypothesis==3.44.6 -f https://s3.amazonaws.com/ossci-linux/wheels/hypothesis-3.44.6-py3-none-any.whl # Collect additional tests to run (outside caffe2/python) EXTRA_TESTS=() diff --git a/.jenkins/pytorch/build.sh b/.jenkins/pytorch/build.sh index b94e797e7010..b6e21c363133 100755 --- a/.jenkins/pytorch/build.sh +++ b/.jenkins/pytorch/build.sh @@ -151,8 +151,8 @@ if [[ "$BUILD_ENVIRONMENT" == *rocm* ]]; then fi if [[ -n "$IN_CI" ]]; then - # Set ROCM_ARCH to gtx900 and gtx906 in CircleCI - echo "Limiting PYTORCH_ROCM_ARCH to gfx90[06] for CircleCI builds" + # Set ROCM_ARCH to gfx900 and gfx906 for CI builds + echo "Limiting PYTORCH_ROCM_ARCH to gfx90[06] for CI builds" export PYTORCH_ROCM_ARCH="gfx900;gfx906" fi diff --git a/.jenkins/pytorch/common.sh b/.jenkins/pytorch/common.sh index 7747324676b5..96e3f5d8ede1 100644 --- a/.jenkins/pytorch/common.sh +++ b/.jenkins/pytorch/common.sh @@ -140,4 +140,3 @@ fi retry () { $* || (sleep 1 && $*) || (sleep 2 && $*) } - diff --git a/.jenkins/pytorch/multigpu-test.sh b/.jenkins/pytorch/multigpu-test.sh index fb5e6f54d013..c61da6af707f 100755 --- a/.jenkins/pytorch/multigpu-test.sh +++ b/.jenkins/pytorch/multigpu-test.sh @@ -17,7 +17,6 @@ fi python tools/download_mnist.py --quiet -d test/cpp/api/mnist OMP_NUM_THREADS=2 TORCH_CPP_TEST_MNIST_PATH="test/cpp/api/mnist" build/bin/test_api -time python test/run_test.py --verbose -i distributed/test_distributed_fork time python test/run_test.py --verbose -i distributed/test_c10d time python test/run_test.py --verbose -i distributed/test_c10d_spawn assert_git_not_dirty diff --git a/.jenkins/pytorch/test.sh b/.jenkins/pytorch/test.sh index 88bcfc93e19d..78ba67c088ee 100755 --- a/.jenkins/pytorch/test.sh +++ b/.jenkins/pytorch/test.sh @@ -32,18 +32,6 @@ if [[ "$BUILD_ENVIRONMENT" != *ppc64le* ]] && [[ "$BUILD_ENVIRONMENT" != *-bazel # ninja is installed in $HOME/.local/bin, e.g., /var/lib/jenkins/.local/bin for CI user jenkins # but this script should be runnable by any user, including root export PATH="$HOME/.local/bin:$PATH" - - # TODO: Please move this to Docker - # The version is fixed to avoid flakiness: https://github.com/pytorch/pytorch/issues/31136 - pip_install --user "hypothesis==4.53.2" - # Pin MyPy version because new errors are likely to appear with each release - pip_install --user "mypy==0.770" - # Update scikit-learn to a python-3.8 compatible version - if [[ $(python -c "import sys; print(int(sys.version_info >= (3, 8)))") == "1" ]]; then - pip_install -U scikit-learn - fi - - pip_install --user tb-nightly fi # DANGER WILL ROBINSON. 
The LD_PRELOAD here could cause you problems diff --git a/CMakeLists.txt b/CMakeLists.txt index 0bac4993d1b6..2a0dd1c75974 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -722,6 +722,8 @@ endif() if(ANDROID AND (NOT ANDROID_DEBUG_SYMBOLS)) if(CMAKE_COMPILER_IS_GNUCXX) string(APPEND CMAKE_CXX_FLAGS " -s") + elseif("${CMAKE_CXX_COMPILER_ID}" MATCHES "Clang") + string(APPEND CMAKE_CXX_FLAGS " -g0") else() string(APPEND CMAKE_EXE_LINKER_FLAGS " -s") endif() diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index a1b4096592a7..6593e35e4cf9 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -313,6 +313,8 @@ pip install -r requirements.txt # npm install -g katex # Or if you prefer an uncontaminated global executable environment or do not want to go through the node configuration: # npm install katex && export PATH="$PATH:$(pwd)/node_modules/.bin" +# If you're a Facebook employee using a devserver, yarn may be more convenient: +# yarn global add katex ``` 3. Generate the documentation HTML files. The generated files will be in `docs/build/html`. @@ -353,7 +355,7 @@ information on the documentation syntax. We run Doxygen in CI (Travis) to verify that you do not use invalid Doxygen commands. To run this check locally, run `./check-doxygen.sh` from inside -`docs/cpp`. +`docs/cpp/source`. To build the documentation, follow the same steps as above, but run them from `docs/cpp` instead of `docs`. @@ -378,6 +380,14 @@ et my_machine -t="8000:8000" Then navigate to `localhost:8000` in your web browser. +Alternatively, you can run `rsync` on your local machine to copy the files from +your remote machine: +```bash +mkdir -p build cpp/build +rsync -az me@my_machine:/path/to/pytorch/docs/build/html build +rsync -az me@my_machine:/path/to/pytorch/docs/cpp/build/html cpp/build +``` + #### Submitting changes for review It is helpful when submitting a PR that changes the docs to provide a rendered diff --git a/aten/src/ATen/core/boxing/KernelFunction_test.cpp b/aten/src/ATen/core/boxing/KernelFunction_test.cpp index 87517afe27c6..8ba50db14a2b 100644 --- a/aten/src/ATen/core/boxing/KernelFunction_test.cpp +++ b/aten/src/ATen/core/boxing/KernelFunction_test.cpp @@ -110,50 +110,92 @@ OperatorHandle makeDummyOperatorHandle() { // boxed kernels that return refs to tensor arguments, a la inplace/outplace kernels // -void boxed_func_with_tensor_ref_return(const OperatorHandle& /*opHandle*/, Stack* stack) { +void boxed_func_for_inplace_op(const OperatorHandle& /*opHandle*/, Stack* stack) { // (Tensor(a!), Scalar) -> Tensor(a!) EXPECT_EQ(2, stack->size()); ASSERT_TRUE(stack->at(0).isTensor()); - auto a = stack->at(0).toTensor(); + auto t = stack->at(0).toTensor(); ASSERT_TRUE(stack->at(1).isScalar()); - auto b = stack->at(1).toScalar(); + auto s = stack->at(1).toScalar(); - a.add_(b); + t.add_(s); stack->clear(); - torch::jit::push(stack, a); + torch::jit::push(stack, t); } -void boxed_func_with_multiple_tensor_ref_return(const OperatorHandle& /*opHandle*/, Stack* stack) { +void boxed_func_for_outofplace_op(const OperatorHandle& /*opHandle*/, Stack* stack) { + // (Scalar, Tensor(a!)) -> Tensor(a!) 
+ EXPECT_EQ(2, stack->size()); + + ASSERT_TRUE(stack->at(0).isScalar()); + auto s = stack->at(0).toScalar(); + + ASSERT_TRUE(stack->at(1).isTensor()); + auto t = stack->at(1).toTensor(); + + t.add_(s); + + stack->clear(); + torch::jit::push(stack, t); +} + +void boxed_func_for_outofplace_multi_op(const OperatorHandle& /*opHandle*/, Stack* stack) { // (Tensor(a!), Tensor(b!), Scalar, Scalar) -> (Tensor(a!), Tensor(b!)) EXPECT_EQ(4, stack->size()); ASSERT_TRUE(stack->at(0).isTensor()); - auto a = stack->at(0).toTensor(); + auto t1 = stack->at(0).toTensor(); ASSERT_TRUE(stack->at(1).isTensor()); - auto b = stack->at(1).toTensor(); + auto t2 = stack->at(1).toTensor(); ASSERT_TRUE(stack->at(2).isScalar()); - auto c = stack->at(2).toScalar(); + auto s1 = stack->at(2).toScalar(); ASSERT_TRUE(stack->at(3).isScalar()); - auto d = stack->at(3).toScalar(); + auto s2 = stack->at(3).toScalar(); + + t1.add_(s1); + t2.add_(s2); + + stack->clear(); + torch::jit::push(stack, t1); + torch::jit::push(stack, t2); +} + +void boxed_func_for_legacy_outofplace_multi_op(const OperatorHandle& /*opHandle*/, Stack* stack) { + // (Scalar, Scalar, Tensor(a!), Tensor(b!)) -> (Tensor(a!), Tensor(b!)) + EXPECT_EQ(4, stack->size()); + + ASSERT_TRUE(stack->at(0).isScalar()); + auto s1 = stack->at(0).toScalar(); + + ASSERT_TRUE(stack->at(1).isScalar()); + auto s2 = stack->at(1).toScalar(); + + ASSERT_TRUE(stack->at(2).isTensor()); + auto t1 = stack->at(2).toTensor(); + + ASSERT_TRUE(stack->at(3).isTensor()); + auto t2 = stack->at(3).toTensor(); - a.add_(c); - b.add_(d); + t1.add_(s1); + t2.add_(s2); stack->clear(); - torch::jit::push(stack, a); - torch::jit::push(stack, b); + torch::jit::push(stack, t1); + torch::jit::push(stack, t2); } // // boxed calling tests: // +// functional + void expectBoxedCallingWithReturnWorks(const KernelFunction& func) { called_with_args = c10::nullopt; vector stack {3, 4}; @@ -198,50 +240,76 @@ void expectBoxedCallingWithMultiReturnWorks(const KernelFunction& func) { EXPECT_EQ(12, stack[1].toInt()); } -void expectBoxedCallingWithTensorRefReturnWorks(const KernelFunction& func) { - OperatorHandle dummy = makeDummyOperatorHandle(); +// in/out - auto a = at::zeros({1}); - auto b = 1.0f; - vector stack {a, b}; +void expectInPlaceBoxedCallingWorks(const KernelFunction& func) { + OperatorHandle dummy = makeDummyOperatorHandle(); + auto t = at::zeros({1}); + auto s = 1.0f; + vector stack {t, s}; func.callBoxed(dummy, &stack); - // kernel should have updated arg 0 - EXPECT_EQ(a.item().toFloat(), 1.0f); - - // and returned it on the stack + // kernel should have updated out arg and returned it + EXPECT_EQ(t.item().toFloat(), 1.0f); EXPECT_EQ(1, stack.size()); EXPECT_TRUE(stack[0].isTensor()); - auto t = stack[0].toTensor(); - EXPECT_EQ(t.item().toFloat(), 1.0f); + EXPECT_TRUE(stack[0].toTensor().is_same(t)); } -void expectBoxedCallingWithMultipleTensorRefReturnWorks(const KernelFunction& func) { +void expectOutOfPlaceBoxedCallingWorks(const KernelFunction& func) { OperatorHandle dummy = makeDummyOperatorHandle(); - auto a = at::zeros({1}); - auto b = at::zeros({1}); - auto c = 1.0f; - auto d = 2.0f; - vector stack {a, b, c, d}; - + auto s = 1.0f; + auto t = at::zeros({1}); + vector stack {s, t}; func.callBoxed(dummy, &stack); - // kernel should have updated args 0 and 1 - EXPECT_EQ(a.item().toFloat(), 1.0f); - EXPECT_EQ(b.item().toFloat(), 2.0f); + // kernel should have updated out arg and returned it on the stack + EXPECT_EQ(t.item().toFloat(), 1.0f); + EXPECT_EQ(1, stack.size()); + 
EXPECT_TRUE(stack[0].isTensor()); + EXPECT_TRUE(stack[0].toTensor().is_same(t)); +} - // and pushed them onto the stack - EXPECT_EQ(2, stack.size()); +void expectOutOfPlaceMultiBoxedCallingWorks(const KernelFunction& func) { + OperatorHandle dummy = makeDummyOperatorHandle(); + + auto t1 = at::zeros({1}); + auto t2 = at::zeros({1}); + auto s1 = 1.0f; + auto s2 = 2.0f; + vector stack {t1, t2, s1, s2}; + func.callBoxed(dummy, &stack); + // kernel should have updated output args and returned them on the stack + EXPECT_EQ(t1.item().toFloat(), 1.0f); + EXPECT_EQ(t2.item().toFloat(), 2.0f); + EXPECT_EQ(2, stack.size()); EXPECT_TRUE(stack[0].isTensor()); - auto ta = stack[0].toTensor(); - EXPECT_EQ(ta.item().toFloat(), 1.0f); + EXPECT_TRUE(stack[0].toTensor().is_same(t1)); + EXPECT_TRUE(stack[1].isTensor()); + EXPECT_TRUE(stack[1].toTensor().is_same(t2)); +} +void expectLegacyOutOfPlaceMultiBoxedCallingWorks(const KernelFunction& func) { + OperatorHandle dummy = makeDummyOperatorHandle(); + + auto s1 = 1.0f; + auto s2 = 2.0f; + auto t1 = at::zeros({1}); + auto t2 = at::zeros({1}); + vector stack {s1, s2, t1, t2}; + func.callBoxed(dummy, &stack); + + // kernel should have updated output args and returned them on the stack + EXPECT_EQ(t1.item().toFloat(), 1.0f); + EXPECT_EQ(t2.item().toFloat(), 2.0f); + EXPECT_EQ(2, stack.size()); + EXPECT_TRUE(stack[0].isTensor()); + EXPECT_TRUE(stack[0].toTensor().is_same(t1)); EXPECT_TRUE(stack[1].isTensor()); - auto tb = stack[1].toTensor(); - EXPECT_EQ(tb.item().toFloat(), 2.0f); + EXPECT_TRUE(stack[1].toTensor().is_same(t2)); } void expectBoxedCallingFailsWith(const KernelFunction& func, const char* errorMessage) { @@ -254,6 +322,12 @@ void expectBoxedCallingFailsWith(const KernelFunction& func, const char* errorMe }, errorMessage); } +// +// unboxed calling tests: +// + +// functional + // make an unboxed call to a kernel that returns a single value. // void expectUnboxedCallingWithReturnWorks(const KernelFunction& func) { @@ -294,57 +368,84 @@ void expectUnboxedCallingWithMultiReturnWorks(const KernelFunction& func) { EXPECT_EQ((tuple(7, 12)), result); } -// make an unboxed call to a kernel that modifies its first (Tensor) argument -// and returns a reference to it. 
-// -void expectUnboxedCallingWithTensorRefReturnWorks(const KernelFunction& func) { +// in/out + +void expectInPlaceUnboxedCallingWorks(const KernelFunction& func) { OperatorHandle dummy = makeDummyOperatorHandle(); - auto a = at::zeros({1}); + auto t = at::zeros({1}); + at::Tensor& t_out = func.call(dummy, t, 1.0f); - at::Tensor& t = func.call(dummy, a, 1.0f); + // should have updated first arg and returned it + EXPECT_EQ(t.item().toFloat(), 1.0f); + EXPECT_EQ(&t, &t_out); +} + +void expectOutOfPlaceUnboxedCallingWorks(const KernelFunction& func) { + OperatorHandle dummy = makeDummyOperatorHandle(); - EXPECT_EQ(a.item().toFloat(), 1.0f); + auto t = at::zeros({1}); + at::Tensor& t_out = func.call(dummy, 1.0f, t); + + // should have updated out arg and returned it EXPECT_EQ(t.item().toFloat(), 1.0f); + EXPECT_EQ(&t, &t_out); +} + +void expectOutOfPlaceMultiUnboxedCallingWorks(const KernelFunction& func) { + OperatorHandle dummy = makeDummyOperatorHandle(); - EXPECT_EQ(&a, &t); + auto t1 = at::zeros({1}); + auto t2 = at::zeros({1}); + auto s1 = 1.0f; + auto s2 = 2.0f; + + std::tuple tup = func.call< + std::tuple, at::Tensor&, at::Tensor&, at::Scalar, at::Scalar + >(dummy, t1, t2, s1, s2); + + // kernel should have updated out args and returned them in a tuple + EXPECT_EQ(t1.item().toFloat(), 1.0f); + EXPECT_EQ(t2.item().toFloat(), 2.0f); + + auto t1_out = std::get<0>(tup); + EXPECT_EQ(t1_out.item().toFloat(), 1.0f); + EXPECT_TRUE(t1_out.is_same(t1)); + + auto t2_out = std::get<1>(tup); + EXPECT_EQ(t2_out.item().toFloat(), 2.0f); + EXPECT_TRUE(t2_out.is_same(t2)); } -// make an unboxed call to a kernel that modifies its first two (Tensor) arguments -// and returns them. When calling unboxed, these are returned as a tuple. -// -void expectUnboxedCallingWithMultipleTensorRefReturnWorks(const KernelFunction& func) { +void expectLegacyOutOfPlaceMultiUnboxedCallingWorks(const KernelFunction& func) { OperatorHandle dummy = makeDummyOperatorHandle(); - auto a = at::zeros({1}); - auto b = at::zeros({1}); - auto c = 1.0f; - auto d = 2.0f; + auto s1 = 1.0f; + auto s2 = 2.0f; + auto t1 = at::zeros({1}); + auto t2 = at::zeros({1}); std::tuple tup = func.call< - std::tuple, - at::Tensor&, - at::Tensor&, - at::Scalar, - at::Scalar - >(dummy, a, b, c, d); + std::tuple, at::Scalar, at::Scalar, at::Tensor&, at::Tensor& + >(dummy, s1, s2, t1, t2); - // kernel should have updated args 0 and 1 - EXPECT_EQ(a.item().toFloat(), 1.0f); - EXPECT_EQ(b.item().toFloat(), 2.0f); + // kernel should have updated out args and returned them in a tuple + EXPECT_EQ(t1.item().toFloat(), 1.0f); + EXPECT_EQ(t2.item().toFloat(), 2.0f); - // and returned a tuple containing them - auto ta = std::get<0>(tup); - EXPECT_EQ(ta.item().toFloat(), 1.0f); - EXPECT_TRUE(a.is_same(ta)); + auto t1_out = std::get<0>(tup); + EXPECT_EQ(t1_out.item().toFloat(), 1.0f); + EXPECT_TRUE(t1_out.is_same(t1)); - auto tb = std::get<1>(tup); - EXPECT_EQ(tb.item().toFloat(), 2.0f); - EXPECT_TRUE(b.is_same(tb)); + auto t2_out = std::get<1>(tup); + EXPECT_EQ(t2_out.item().toFloat(), 2.0f); + EXPECT_TRUE(t2_out.is_same(t2)); } } +// functional, boxed calling + TEST(KernelFunctionTest, givenBoxedFunction_withReturn_whenCallingBoxed_thenWorks) { KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_with_return>(); kernels::expectBoxedCallingWithReturnWorks(func); @@ -360,16 +461,30 @@ TEST(KernelFunctionTest, givenBoxedFunction_withMultiReturn_whenCallingBoxed_the kernels::expectBoxedCallingWithMultiReturnWorks(func); } 
-TEST(KernelFunctionTest, givenBoxedFunction_withTensorRefReturn_whenCallingBoxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_with_tensor_ref_return>(); - kernels::expectBoxedCallingWithTensorRefReturnWorks(func); +// in/out, boxed calling + +TEST(KernelFunctionTest, givenBoxedFunction_withInPlaceSignature_whenCallingBoxed_thenWorks) { + KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_for_inplace_op>(); + kernels::expectInPlaceBoxedCallingWorks(func); +} + +TEST(KernelFunctionTest, givenBoxedFunction_withOutOfPlaceSignature_whenCallingBoxed_thenWorks) { + KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_for_outofplace_op>(); + kernels::expectOutOfPlaceBoxedCallingWorks(func); } -TEST(KernelFunctionTest, givenBoxedFunction_withMultipleTensorRefReturn_whenCallingBoxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_with_multiple_tensor_ref_return>(); - kernels::expectBoxedCallingWithMultipleTensorRefReturnWorks(func); +TEST(KernelFunctionTest, givenBoxedFunction_withOutOfPlaceMultiSignature_whenCallingBoxed_thenWorks) { + KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_for_outofplace_multi_op>(); + kernels::expectOutOfPlaceMultiBoxedCallingWorks(func); } +TEST(KernelFunctionTest, givenBoxedFunction_withLegacyOutOfPlaceMultiSignature_whenCallingBoxed_thenWorks) { + KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_for_legacy_outofplace_multi_op>(); + kernels::expectLegacyOutOfPlaceMultiBoxedCallingWorks(func); +} + +// functional, unboxed calling + TEST(KernelFunctionTest, givenBoxedFunction_withReturn_whenCallingUnboxed_thenWorks) { KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_with_return>(); kernels::expectUnboxedCallingWithReturnWorks(func); @@ -385,16 +500,30 @@ TEST(KernelFunctionTest, givenBoxedFunction_withMultiReturn_whenCallingUnboxed_t kernels::expectUnboxedCallingWithMultiReturnWorks(func); } -TEST(KernelFunctionTest, givenBoxedFunction_withTensorRefReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_with_tensor_ref_return>(); - kernels::expectUnboxedCallingWithTensorRefReturnWorks(func); +// in/out, unboxed calling + +TEST(KernelFunctionTest, givenBoxedFunction_withInPlaceSignature_whenCallingUnboxed_thenWorks) { + KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_for_inplace_op>(); + kernels::expectInPlaceUnboxedCallingWorks(func); } -TEST(KernelFunctionTest, givenBoxedFunction_withMultipleTensorRefReturn_whenCallingUnboxed_thenWorks) { - KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_with_multiple_tensor_ref_return>(); - kernels::expectUnboxedCallingWithMultipleTensorRefReturnWorks(func); +TEST(KernelFunctionTest, givenBoxedFunction_withOutOfPlaceSignature_whenCallingUnboxed_thenWorks) { + KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_for_outofplace_op>(); + kernels::expectOutOfPlaceUnboxedCallingWorks(func); } +TEST(KernelFunctionTest, givenBoxedFunction_withOutOfPlaceMultiSignature_whenCallingUnboxed_thenWorks) { + KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_for_outofplace_multi_op>(); + kernels::expectOutOfPlaceMultiUnboxedCallingWorks(func); +} + +TEST(KernelFunctionTest, 
givenBoxedFunction_withLegacyOutOfPlaceMultiSignature_whenCallingUnboxed_thenWorks) { + KernelFunction func = KernelFunction::makeFromBoxedFunction<&kernels::boxed_func_for_legacy_outofplace_multi_op>(); + kernels::expectLegacyOutOfPlaceMultiUnboxedCallingWorks(func); +} + +// functors etc. + TEST(KernelFunctionTest, givenUnboxedFunctor_withReturn_whenCallingBoxed_thenWorks) { KernelFunction func = KernelFunction::makeFromUnboxedFunctor(std::unique_ptr(std::make_unique())); kernels::expectBoxedCallingWithReturnWorks(func); diff --git a/aten/src/ATen/core/boxing/impl/boxing.h b/aten/src/ATen/core/boxing/impl/boxing.h index 484d462b8ad9..4f9ae1fced70 100644 --- a/aten/src/ATen/core/boxing/impl/boxing.h +++ b/aten/src/ATen/core/boxing/impl/boxing.h @@ -71,50 +71,20 @@ using can_unbox = >; // -// BoxedKernelWrapper -// -// For a given function type FT, BoxedKernelWrapper implements -// -// 1. a `boxArgs` method that boxes the function's arguments - i.e., -// inserts each argument into an IValue that it pushes onto a -// torch::jit::Stack, which it returns -// -// 2. a `call` method that -// - takes a boxed kernel and unboxed arguments as specified by FT, -// - calls `boxArgs` to box the arguments -// - calls the boxed kernel -// - unboxes and returns the result +// boxArgs - utility for pushing unboxed args onto IValue stack // -// The partial specializations below handle various cases: in -// particular, not all types appearing in op signatures are supported, -// and ops returning references have nonstandard wrapper implementations. -// - -// 1. The base specialization of BoxedKernelWrapper should never be instantiated. -// A "no call method defined on BoxedKernelWrapper" compile error means that -// an op signature has failed to trigger any of the partial specializations -// that follow this one. -// -template -struct BoxedKernelWrapper { - // The reason we're not just doing straight up static_assert(false, ...) here: - // Basically, the way to make sure a static_assert only fires if a template - // is actually instantiated (rather than every time the file is parsed) is to use - // template parameters in the expression, e.g. FuncType here. However, since - // `sizeof(FuncType) != sizeof(FuncType)` is always false, this has the same - // effect. - static_assert(sizeof(FuncType) != sizeof(FuncType), - "Function signature contains one or more unsupported parameter and/or return types. " - "Look for a nearby error like " - "\"'call' is not a member of 'c10::impl::BoxedKernelWrapper<(your function type), void>'\" " - "- (your function type) is the unsupported signature."); -}; +template +static torch::jit::Stack boxArgs(Args... args) { + // TODO Reuse stack vector instead of allocating? + torch::jit::Stack stack; + stack.reserve(sizeof...(Args)); + torch::jit::push(stack, std::forward(args)...); + return stack; +} // -// 2. Supported signatures, other than ref-passing. -// - -// helper class whose specializations handle single and multiple return values, respectively +// PopResult is a helper class whose specializations handle popping single and +// multiple return values, respectively. 
// template struct PopResult final { @@ -151,6 +121,46 @@ struct PopResult> final { } }; +// +// BoxedKernelWrapper +// +// For a given function type FT, BoxedKernelWrapper implements +// a `call` method that +// - takes a boxed kernel and unboxed arguments as specified by FT, +// - calls `boxArgs` to box the arguments +// - calls the boxed kernel +// - unboxes and returns the result +// +// The partial specializations below handle various cases: in +// particular, not all types appearing in op signatures are supported, +// and ops returning references have nonstandard wrapper implementations. +// + +// 1. The base specialization of BoxedKernelWrapper should never be instantiated. +// A "no call method defined on BoxedKernelWrapper" compile error means that +// an op signature has failed to trigger any of the partial specializations +// that follow this one. +// +template +struct BoxedKernelWrapper { + // The reason we're not just doing straight up static_assert(false, ...) here: + // Basically, the way to make sure a static_assert only fires if a template + // is actually instantiated (rather than every time the file is parsed) is to use + // template parameters in the expression, e.g. FuncType here. However, since + // `sizeof(FuncType) != sizeof(FuncType)` is always false, this has the same + // effect. + static_assert(sizeof(FuncType) != sizeof(FuncType), + "Function signature contains one or more unsupported parameter and/or return types. " + "Look for a nearby error like " + "\"'call' is not a member of 'c10::impl::BoxedKernelWrapper<(your function type), void>'\" " + "- (your function type) is the unsupported signature."); +}; + +// +// 2. Supported signatures, other than those involving non-const Tensor refs - +// i.e., "functional" ops. +// + template struct BoxedKernelWrapper< Result(Args...), @@ -159,14 +169,6 @@ struct BoxedKernelWrapper< void > > { - static torch::jit::Stack boxArgs(Args... args) { - // TODO Reuse stack vector instead of allocating? - torch::jit::Stack stack; - stack.reserve(sizeof...(Args)); - torch::jit::push(stack, std::forward(args)...); - return stack; - } - static Result call( KernelFunction::InternalBoxedKernelFunction* boxed_kernel_func, OperatorKernel* functor, @@ -194,12 +196,15 @@ struct BoxedKernelWrapper< }; // -// 3. signatures taking a single Tensor reference as their first argument, -// and also returning one. +// 3. in-place and legacy out-of-place ops take a single non-const Tensor +// reference as their first argument, and return it. +// +// Note: all signatures matching this pattern are are assumed to be for such ops. +// Because of this, the generated BoxedKernelWrapper specializations simply +// return the in-place argument. // -// Note that the passed kernels are assumed to be for inplace/outplace ops, -// and the generated BoxedKernelWrapper specializations will simply return -// the initial argument. +// TODO update comment when legacy out-of-place signatures no longer need +// to be supported, due to hacky_wrapper reordering // template @@ -207,21 +212,11 @@ struct BoxedKernelWrapper< at::Tensor&(at::Tensor&, OtherArgs...), std::enable_if_t::value, void> > { - static torch::jit::Stack boxArgs(at::Tensor& outArg, OtherArgs... otherArgs) { - // TODO Reuse stack vector instead of allocating? 
-    torch::jit::Stack stack;
-    stack.reserve(1 + sizeof...(OtherArgs));
-    torch::jit::push_one(stack, outArg);
-    torch::jit::push(stack, std::forward(otherArgs)...);
-    return stack;
-  }
-
   static at::Tensor& call(
     KernelFunction::InternalBoxedKernelFunction* boxed_kernel_func,
     OperatorKernel* functor,
     const OperatorHandle& opHandle,
-    at::Tensor& outArg,
-    OtherArgs... otherArgs
+    at::Tensor& outArg, OtherArgs... otherArgs
   ) {
     torch::jit::Stack stack = boxArgs(outArg, otherArgs...);
     (*boxed_kernel_func)(functor, opHandle, &stack);
@@ -236,30 +231,75 @@ struct BoxedKernelWrapper<
 };
 
 //
-// 4. signatures returning a tuple of Tensor references, and taking the same
-// number of Tensor refs as their initial arguments.
+// 4. out of place ops that take a single non-const Tensor reference as their
+// final argument, and also return it.
 //
-// Note that the passed kernels are assumed to be for inplace/outplace ops,
-// and the generated BoxedKernelWrapper specializations will return a tuple
-// of those initial arguments.
+// Note: all signatures matching this pattern are assumed to be for such ops.
+// This assumption permits the generated BoxedKernelWrapper specializations to simply
+// return out arguments.
 //
+template
+struct BoxedKernelWrapper<
+  at::Tensor&(FirstArg, RestArgs...),
+  std::enable_if_t<
+    can_box_all::value
+    // this skips over in-place (and legacy out-of-place) kernels with a non-const Tensor
+    // arg at the front, so those can unambiguously trigger the preceding specialization.
+    // TODO update comment when hacky_wrapper reorders legacy out-of-place signatures
+    && !is_mutable_tensor_ref::value,
+    void
+  >
+> {
+  static at::Tensor& call(
+    KernelFunction::InternalBoxedKernelFunction* boxed_kernel_func,
+    OperatorKernel* functor,
+    const OperatorHandle& opHandle,
+    FirstArg firstArg, RestArgs... restArgs
+  ) {
+    torch::jit::Stack stack = boxArgs(firstArg, restArgs...);
+    (*boxed_kernel_func)(functor, opHandle, &stack);
+    TORCH_INTERNAL_ASSERT_DEBUG_ONLY(
+      stack.size() == 1,
+      "Boxed kernel was expected to return a single value on the stack, ",
+      "but instead returned ", stack.size(), " values."
+    );
+
+    return std::get(std::tuple{restArgs...});
+  }
+};
+//
+// 5. out of place ops that take multiple non-const Tensor references as their
+// final arguments, and return them in a std::tuple.
+//
+// Note: all signatures matching this pattern are assumed to be for such ops.
+// This assumption permits the generated BoxedKernelWrapper specializations to simply
+// return the out arguments.
+//
 template
 struct BoxedKernelWrapper<
   Result(Args...),
   std::enable_if_t<
-    can_box_all::value && is_tuple_of_mutable_tensor_refs::value,
+    can_box_all::value && is_tuple_of_mutable_tensor_refs::value
+    // this test skips over legacy kernels with out args at the front, so they can trigger
+    // the specialization that follows.
+    // note: this test is complicated by the fact that boolean value expressions in templates
+    // don't shortcut. some signatures have a result tuple that's wider than the arg list, and
+    // without the length limiting ternary these will cause a template evaluation error on this
+    // test, even if a length check precedes it in the conjunction.
+    // TODO remove when hacky_wrapper reorders legacy kernel out args
+    && !std::is_same<
+      Result,
+      guts::typelist::to_tuple_t<
+        guts::typelist::take_t<
+          guts::typelist::typelist,
+          sizeof...(Args) >= std::tuple_size::value ?
std::tuple_size::value : sizeof...(Args) + > + > + >::value, void > > { - static torch::jit::Stack boxArgs(Args... args) { - // TODO Reuse stack vector instead of allocating? - torch::jit::Stack stack; - stack.reserve(sizeof...(Args)); - torch::jit::push(stack, std::forward(args)...); - return stack; - } - static Result call( KernelFunction::InternalBoxedKernelFunction* boxed_kernel_func, OperatorKernel* functor, @@ -277,15 +317,65 @@ struct BoxedKernelWrapper< "but instead returned ", stack.size(), " values." ); - auto result = guts::tuple_take(ArgTuple{args...}); + auto result = guts::tuple_take(ArgTuple{args...}); static_assert( std::is_same::value, "The parameter list of an op returning a tuple of Tensor references " - "must begin with an equal number of Tensor reference parameters." + "must end with an equal number of Tensor reference parameters." ); return result; } }; +// +// 6. legacy trap for old-school multi-return out functions with mutable args +// at start rather than end of arg list. +// TODO remove when hacky_wrapper reorders legacy kernel out args +// + +template +struct BoxedKernelWrapper< + Result(Args...), + std::enable_if_t< + can_box_all::value && is_tuple_of_mutable_tensor_refs::value + // this test fires passes for legacy kernels with out args at the front. + // note: this test is complicated by the fact that boolean value expressions in templates + // don't shortcut. some signatures have a result tuple that's wider than the arg list, and + // without the length limiting ternary these will cause a template evaluation error on this + // test, even if a length check precedes it in the conjunction. + && std::is_same< + Result, + guts::typelist::to_tuple_t< + guts::typelist::take_t< + guts::typelist::typelist, + sizeof...(Args) >= std::tuple_size::value ? std::tuple_size::value : sizeof...(Args) + > + > + >::value, + void + > +> { + static Result call( + KernelFunction::InternalBoxedKernelFunction* boxed_kernel_func, + OperatorKernel* functor, + const OperatorHandle& opHandle, + Args... args + ) { + using ArgTuple = std::tuple; + constexpr int RetCount = std::tuple_size(); + + torch::jit::Stack stack = boxArgs(args...); + (*boxed_kernel_func)(functor, opHandle, &stack); + TORCH_INTERNAL_ASSERT_DEBUG_ONLY( + stack.size() == RetCount, + "Boxed kernel was expected to return ", RetCount, " values on the stack, ", + "but instead returned ", stack.size(), " values." + ); + + auto legacy_result = guts::tuple_take(ArgTuple{args...}); + return legacy_result; + } +}; + } // impl } // c10 diff --git a/aten/src/ATen/core/dispatch/Dispatcher.h b/aten/src/ATen/core/dispatch/Dispatcher.h index a5f9354d7ca2..8c5fec73308e 100644 --- a/aten/src/ATen/core/dispatch/Dispatcher.h +++ b/aten/src/ATen/core/dispatch/Dispatcher.h @@ -387,7 +387,7 @@ inline Return Dispatcher::callWithDispatchKey(const TypedOperatorHandle::boxArgs(args...); + torch::jit::Stack stack = impl::boxArgs(args...); guard.before(op, stack, seq_num); } else { guard.before(op, seq_num); diff --git a/aten/src/ATen/core/ivalue.h b/aten/src/ATen/core/ivalue.h index 88303c524aa1..9ea18dc8482d 100644 --- a/aten/src/ATen/core/ivalue.h +++ b/aten/src/ATen/core/ivalue.h @@ -82,6 +82,19 @@ struct OptionalArray { } }; +// Capsule is an internal implementation detail of custom C++ classes. We +// define it as an owning wrapper for +// c10::intrusive_ptr This wrapper is here to serve as +// an abstraction of the type erased custom class object pointer. 
It also allow +// pybind11 to treat this as a standalone class to register as a separate type +// caster, instead of a custom pointer holder which the pointer holder type +// caster try to "unwrap" it automatically. +struct Capsule { + c10::intrusive_ptr obj_ptr; + explicit Capsule(c10::intrusive_ptr ptr) + : obj_ptr(std::move(ptr)) {} +}; + // IValue is the generic tagged union used by the interpreter to hold // all value types. // It is a 16-byte object with an 8-byte payload and an 8-byte tag. @@ -327,8 +340,7 @@ struct CAFFE2_API IValue final { /// @private [doxygen private] c10::intrusive_ptr toBlob() const&; - // Capsule. Capsule is an internal implementation detail - // of custom C++ classes. No new callsites of these APIs should + // Capsule. No new callsites of these APIs should // be introduced. static inline IValue make_capsule( intrusive_ptr blob); diff --git a/aten/src/ATen/cpu/vec256/vec256_complex_double.h b/aten/src/ATen/cpu/vec256/vec256_complex_double.h index 1df674383fb3..83a5753fabd2 100644 --- a/aten/src/ATen/cpu/vec256/vec256_complex_double.h +++ b/aten/src/ATen/cpu/vec256/vec256_complex_double.h @@ -275,18 +275,7 @@ template <> class Vec256> { return _mm256_round_pd(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); } Vec256> sqrt() const { - // sqrt(a + bi) - // = sqrt(2)/2 * [sqrt(sqrt(a**2 + b**2) + a) + sgn(b)*sqrt(sqrt(a**2 + b**2) - a)i] - // = sqrt(2)/2 * [sqrt(abs() + a) + sgn(b)*sqrt(abs() - a)i] - - const __m256d scalar = _mm256_set1_pd(std::sqrt(2)/2); //sqrt(2)/2 sqrt(2)/2 - const __m256d sign_mask = _mm256_setr_pd(0.0, -0.0, 0.0, -0.0); - auto sign = _mm256_and_pd(values, sign_mask); - auto factor = _mm256_or_pd(scalar, sign); - - auto a_a = _mm256_xor_pd(_mm256_movedup_pd(values), sign_mask); // a -a - auto res_re_im = _mm256_sqrt_pd(_mm256_add_pd(abs_(), a_a)); // sqrt(abs + a) sqrt(abs - a) - return _mm256_mul_pd(factor, res_re_im); + return map(std::sqrt); } Vec256> reciprocal() const; Vec256> rsqrt() const { diff --git a/aten/src/ATen/cpu/vec256/vec256_complex_float.h b/aten/src/ATen/cpu/vec256/vec256_complex_float.h index dc8ef7cc76d6..28032651f636 100644 --- a/aten/src/ATen/cpu/vec256/vec256_complex_float.h +++ b/aten/src/ATen/cpu/vec256/vec256_complex_float.h @@ -313,18 +313,7 @@ template <> class Vec256> { return _mm256_round_ps(values, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC)); } Vec256> sqrt() const { - // sqrt(a + bi) - // = sqrt(2)/2 * [sqrt(sqrt(a**2 + b**2) + a) + sgn(b)*sqrt(sqrt(a**2 + b**2) - a)i] - // = sqrt(2)/2 * [sqrt(abs() + a) + sgn(b)*sqrt(abs() - a)i] - - const __m256 scalar = _mm256_set1_ps(std::sqrt(2)/2); //sqrt(2)/2 sqrt(2)/2 - const __m256 sign_mask = _mm256_setr_ps(0.0, -0.0, 0.0, -0.0, 0.0, -0.0, 0.0, -0.0); - auto sign = _mm256_and_ps(values, sign_mask); - auto factor = _mm256_or_ps(scalar, sign); - - auto a_a = _mm256_xor_ps(_mm256_moveldup_ps(values), sign_mask); // a -a - auto res_re_im = _mm256_sqrt_ps(_mm256_add_ps(abs_(), a_a)); // sqrt(abs + a) sqrt(abs - a) - return _mm256_mul_ps(factor, res_re_im); + return map(std::sqrt); } Vec256> reciprocal() const; Vec256> rsqrt() const { diff --git a/aten/src/ATen/native/Copy.cpp b/aten/src/ATen/native/Copy.cpp index 360069998f19..1780a553d73d 100644 --- a/aten/src/ATen/native/Copy.cpp +++ b/aten/src/ATen/native/Copy.cpp @@ -13,6 +13,11 @@ #include #include +#ifdef USE_FBGEMM +#include +#include +#endif + namespace { using namespace at; @@ -94,6 +99,31 @@ static Tensor & copy_impl(Tensor & self, const Tensor & src, bool non_blocking) TORCH_CHECK(self.defined(), "self is 
undefined"); TORCH_CHECK(src.defined(), "src is undefined"); + // FBGeMM kernel support exists only for the following case, + // 1. Memory Format for source and destination tensors is contiguous. + // 2. Device for both the source and destination tensor is CPU. + // 3. dtype conversion between FP32->FP16 and FP16->FP32. + #ifdef USE_FBGEMM + if (((self.dtype() == at::kFloat && src.dtype() == at::kHalf) || + (self.dtype() == at::kHalf && src.dtype() == at::kFloat)) && + (self.device().is_cpu() && src.device().is_cpu()) && + !self.is_sparse() && !src.is_sparse() && + ((self.is_contiguous() && src.is_contiguous()) || + (self.is_non_overlapping_and_dense() && self.strides() == src.strides()))) { + if (src.dtype() == at::kFloat && self.dtype() == at::kHalf) { + auto* output_ptr = reinterpret_cast( + self.data_ptr()); + fbgemm::FloatToFloat16_simd(src.data_ptr(), output_ptr, self.numel()); + } else { + auto in_data = reinterpret_cast( + src.data_ptr()); + auto* output_ptr = self.data_ptr(); + fbgemm::Float16ToFloat_simd(in_data, output_ptr, self.numel()); + } + return self; + } + #endif + if (self.is_sparse() && src.is_sparse()) { return at::copy_sparse_to_sparse_(self, src, non_blocking); } else if (self.is_sparse() || src.is_sparse()) { diff --git a/aten/src/ATen/native/ForeachOpsKernels.cpp b/aten/src/ATen/native/ForeachOpsKernels.cpp index 7dd96b1c8d99..5066ca529869 100644 --- a/aten/src/ATen/native/ForeachOpsKernels.cpp +++ b/aten/src/ATen/native/ForeachOpsKernels.cpp @@ -149,48 +149,20 @@ void foreach_tensor_##OP##_scalarlist_slow_(TensorList input, TensorList tensors FOREACH_BINARY_OP_LIST_ALPHA(add); FOREACH_BINARY_OP_LIST_ALPHA(sub); - FOREACH_BINARY_OP_SCALAR(add); FOREACH_BINARY_OP_SCALAR(sub); FOREACH_BINARY_OP_SCALAR(mul); FOREACH_BINARY_OP_SCALAR(div); - FOREACH_BINARY_OP_SCALARLIST(add); FOREACH_BINARY_OP_SCALARLIST(sub); FOREACH_BINARY_OP_SCALARLIST(mul); FOREACH_BINARY_OP_SCALARLIST(div); - FOREACH_BINARY_OP_LIST(mul); FOREACH_BINARY_OP_LIST(div); - FOREACH_UNARY_OP(sqrt); FOREACH_UNARY_OP(exp); -FOREACH_UNARY_OP(abs); -FOREACH_UNARY_OP(acos); -FOREACH_UNARY_OP(asin); -FOREACH_UNARY_OP(atan); -FOREACH_UNARY_OP(ceil); -FOREACH_UNARY_OP(cos); -FOREACH_UNARY_OP(cosh); -FOREACH_UNARY_OP(erf); -FOREACH_UNARY_OP(erfc); -FOREACH_UNARY_OP(expm1); -FOREACH_UNARY_OP(floor); -FOREACH_UNARY_OP(log); -FOREACH_UNARY_OP(log10); -FOREACH_UNARY_OP(log1p); -FOREACH_UNARY_OP(log2); -FOREACH_UNARY_OP(neg); -FOREACH_UNARY_OP(tan); -FOREACH_UNARY_OP(tanh); -FOREACH_UNARY_OP(sin); -FOREACH_UNARY_OP(sinh); -FOREACH_UNARY_OP(round); -FOREACH_UNARY_OP(lgamma); - FOREACH_POINTWISE_OP_SCALAR(addcdiv); FOREACH_POINTWISE_OP_SCALAR(addcmul); - FOREACH_POINTWISE_OP_SCALARLIST(addcdiv); FOREACH_POINTWISE_OP_SCALARLIST(addcmul); diff --git a/aten/src/ATen/native/Linear.cpp b/aten/src/ATen/native/Linear.cpp index c9e03aaa3b6b..6f66c7a120fe 100644 --- a/aten/src/ATen/native/Linear.cpp +++ b/aten/src/ATen/native/Linear.cpp @@ -136,241 +136,331 @@ static Tensor sumproduct_pair(const Tensor& left_, const Tensor& right_, IntArra return result; } -Tensor einsum(std::string eqn, TensorList tensors) { - constexpr size_t number_of_letters = 26; - std::string in_eqn; - size_t pos; - // The equation is given in terms of single lowercase letters ('a'..'z') and potentially an ellipsis. - // Internally, we represent it using indices from 0 to num_total_dimensions, with each letter - // mapped to an index and the ellipsis ('...') being mapped to a number of consequtive indices. 
- // The mapping of letters to internal indices is given in letter_mapping. A value of -1 means that - // the letter has not been assigned an index yet (because it has not been seen). - // The ellipsis is defined by first_ell_idx (the first index) and num_ell_idxes (the number of indices). - // A value of -1 for num_ell_idxes specifies that we have not seen an ellipsis yet. - // Note: The internal indices are NOT the dimensions used internally. There is a mapping to them below. - - std::array letter_mapping; // map letter to internal (numerical) label - letter_mapping.fill(-1); - int64_t num_ell_idxes = -1; - int64_t first_ell_idx = 0; - - // The internal representation of the left hand side fo the equation (with ellipsis expanded) is stored in input_op_idxes. - // For each operand, we have a vector mapping each dimension to an internal index. - // We also keep track of the number of occurrences for each letter (to infer a right hand side if not given) and - // of the last occurrence of each index. - std::vector> input_op_idxes; // the parsed operand indices - std::array num_letter_occurrences; // number of occurrence in the equation of this letter - num_letter_occurrences.fill(0); - std::vector last_idx_occurrence; // the last operator (left to right) using this index - - if ((pos = eqn.find("->")) != std::string::npos) { // check whether we have a right hand side. in_eq is the left hand side - in_eqn = eqn.substr(0, pos); - } else { - in_eqn = eqn; - } - // remove spaces for einsum compatibility (#9929) - in_eqn.erase(std::remove_if(in_eqn.begin(), in_eqn.end(), isspace), in_eqn.end()); - - // next we parse in_eq (the left hand side) by iterating. It is a string of comma separated terms per index - int64_t operand = 0; - std::stringstream eqn_stream(in_eqn); - std::string term; - int64_t num_total_idxes = 0; - while (! eqn_stream.eof()) { - std::getline(eqn_stream, term, ','); // term = string with indices of current term - TORCH_CHECK((int64_t) tensors.size()>operand, "more operands in equation than tensors"); // we cannot have a longer equation than operands. We need to check here before we use the dimension - - int64_t ell_char_count = 0; // handling of ellipsis '...' is a bit tedious, we count the '.' - // if there is an ellipsis, the number of dimensions it represents must be total dim - letter dimensions - int64_t candidate_num_ell_idxes = tensors[operand].dim() - term.size() + 3; - int64_t dims_in_term = 0; // dimensions we have seen - std::vector current_op_idxes; // mapping of operand dimensions to indices for current term - for (auto &c : term) { // c = character with a single letter or '.' - if (c == '.') { - ell_char_count++; - TORCH_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' 
in term ", operand, " of the equation"); - if (ell_char_count == 3) { // this completes the ellipsis - if (num_ell_idxes == -1) { // if we have not seen an ellipsis before, keep track of indices and size - first_ell_idx = num_total_idxes; - num_ell_idxes = candidate_num_ell_idxes; - num_total_idxes += num_ell_idxes; - } - else { // we have seen an ellipsis before, so we check compatibility - TORCH_CHECK(candidate_num_ell_idxes == num_ell_idxes, - "ellipsis must represent ", num_ell_idxes, " dimensions in all terms"); - } - for (int64_t i = 0; i < num_ell_idxes; ++i) { // map ellipsis dimensions in operand to indices - current_op_idxes.push_back(first_ell_idx + i); - last_idx_occurrence.push_back(operand); - } - dims_in_term += num_ell_idxes; // keep track of dimensions - } - } else { // a letter (hopefully) - TORCH_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis, operand ", operand); - TORCH_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); - int64_t letter_num = c-'a'; // letter_num = position in letter_mapping - if (letter_mapping[letter_num] == -1) { // new letter, add internal index and mapping - letter_mapping[letter_num] = num_total_idxes; - num_total_idxes++; - last_idx_occurrence.push_back(operand); - } else { // letter we have already seen - last_idx_occurrence[letter_mapping[letter_num]] = operand; - } - num_letter_occurrences[letter_num]++; - current_op_idxes.push_back(letter_mapping[letter_num]); - dims_in_term++; - } +// There are roughly three parts to compute einsum: +// 1. Parse equation to extract the labels for each input operand and output +// 2. Unsqueeze missing dimensions from input operands and permute to align them +// 3. Compute result by multiplying input operands and summing contraction +// dimensions We do the last part by reducing to bmm. +Tensor einsum(std::string equation, TensorList operands) { + TORCH_CHECK(!operands.empty(), "einsum() must provide at least one operand"); + checkDeviceType("einsum()", operands, operands[0].device().type()); + + // Code for encoding ellipsis ("...") with labels + constexpr int ELLIPSIS = '.'; + + // Find arrow (->) to split equation into lhs and rhs + const auto arrow_pos = equation.find("->"); + + // Convert labels for input operands into an index in [0, 25] and store + // them in op_labels for each operand along with ELLIPSIS. + std::string lhs = equation.substr(0, arrow_pos); + std::vector> op_labels(operands.size()); + bool found_ell = false; + std::string::size_type curr_op = 0; + for (auto i = decltype(lhs.length()){0}; i < lhs.length(); ++i) { + switch (lhs[i]) { + case ' ': + // Ignore spaces + break; + + case '.': + TORCH_CHECK( + // Only one ellipsis per operand can be given + !found_ell, + "einsum() found \'.\' for operand ", + curr_op, + " for which an ellipsis was already found"); + TORCH_CHECK( + // Ensure it's a valid ellipsis + i + 2 < lhs.length() && lhs[++i] == '.' 
&& lhs[++i] == '.', + "einsum() found \'.\' for operand ", + curr_op, + " that is not part of any ellipsis"); + op_labels[curr_op].push_back(ELLIPSIS); + found_ell = true; + break; + + case ',': + // Move onto next operand + ++curr_op; + TORCH_CHECK( + curr_op < operands.size(), + "einsum() fewer operands were provided than specified in the equation"); + found_ell = false; + break; + + default: + // Parse label + TORCH_CHECK( + lhs[i] >= 'a' && lhs[i] <= 'z', + "einsum() operand subscript must be in range [a, z] but found ", + lhs[i], + " for operand ", + curr_op); + // Convert label to index in [0, 25] and store + op_labels[curr_op].push_back(lhs[i] - 'a'); } - TORCH_CHECK(dims_in_term == tensors[operand].dim(), "dimension mismatch for operand ", operand, ": equation ", dims_in_term, " tensor ", tensors[operand].dim()); - input_op_idxes.push_back(std::move(current_op_idxes)); - operand++; } - // in the check below, we need ==, but > is captured above, so the error message can be specific that it is <. - TORCH_CHECK((int64_t) tensors.size()==operand, "more tensors than operands in equation"); - - // the following parses or infers output (right hand side) - // it also assigns the idxes_to_preprocessed_dims (index -> dimension in preprocessed / output tensors) - // for the output indices. -1 means that the index has not been assigned a dimension yet - std::vector idxes_to_preprocessed_dims(num_total_idxes, -1); // the position of the index in the tensor dimensions - int64_t num_output_dims = 0; - if (pos != std::string::npos) { // parse the user provided right hand side - int64_t ell_char_count = 0; - for (auto &c : eqn.substr(pos+2)) { - if (c == '.') { // '.' as part of ellipsis - ell_char_count++; - TORCH_CHECK(ell_char_count <= 3, "can only have '.' in one ellispis '...' in right hand side of the equation"); - if (ell_char_count == 3) { // ellipsis complete - TORCH_CHECK(num_ell_idxes >= 0, "ellipsis '...' may only appear in right hand side if it does in left hand side"); - for (int64_t i = 0; i < num_ell_idxes; ++i) { - idxes_to_preprocessed_dims[first_ell_idx + i] = num_output_dims; - num_output_dims++; - } - } - } else if (! isspace(c)) { // letter (hopefully) - TORCH_CHECK((ell_char_count == 0) || (ell_char_count == 3), "'.' must only occur in ellipsis in the right hand side"); - TORCH_CHECK(('a' <= c) && (c <= 'z'), "only lowercase letters a-z allowed as indices"); - int64_t letter_num = c-'a'; - TORCH_CHECK(idxes_to_preprocessed_dims[letter_mapping[letter_num]] == -1, "index ", c, " occurs twice in output"); - idxes_to_preprocessed_dims[letter_mapping[letter_num]] = num_output_dims; - num_output_dims++; + + TORCH_CHECK( + curr_op == operands.size() - 1, + "einsum() more operands were provided than specified in the equation"); + + // Labels must be within [a, z]. + constexpr int total_labels = 'z' - 'a' + 1; + std::vector label_count(total_labels, 0); + + // The maximum number of dimensions covered by any ellipsis, needed when + // unsqueezing missing dimensions from operands to permute and broadcast + int64_t ell_num_dim = 0; + + // Compute label frequency and number of dimensions covered by ellipsis + // We do this after parsing labels to make it more readable and simpler + // to compute the number of dimensions covered by ellipsis. 
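As a worked illustration of the bookkeeping above (the equation and shapes are chosen for illustration, not taken from the patch): for "...ij,jk" with operands of shapes {5, 2, 3} and {3, 4}, operand 0 parses to labels {ELLIPSIS, 'i'-'a', 'j'-'a'} with nlabels = 2 and ndims = 3, so its ellipsis covers one dimension, while operand 1 parses to {'j'-'a', 'k'-'a'} with no ellipsis. That gives ell_num_dim = 1 and a label_count of 1 for i, 2 for j, and 1 for k; since there is no "->", the implicit output is "...ik".

#include <ATen/ATen.h>
#include <vector>

int main() {
  auto a = at::randn({5, 2, 3});
  auto b = at::randn({3, 4});
  // j appears twice, so it is contracted; the ellipsis dim, i, and k remain
  auto out = at::einsum("...ij,jk", {a, b});
  std::vector<int64_t> expected{5, 2, 4};
  TORCH_CHECK(out.sizes().vec() == expected);
  return 0;
}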
+ for (std::size_t i = 0; i < operands.size(); ++i) { + Tensor operand = operands[i]; + std::vector labels = op_labels[i]; + int64_t nlabels = labels.size(); + int64_t ndims = operand.dim(); + bool has_ellipsis = false; + + for (int label : labels) { + if (label == ELLIPSIS) { + --nlabels; + has_ellipsis = true; + ell_num_dim = std::max(ell_num_dim, ndims - nlabels); + } else { + ++label_count[label]; } } - } else { // create an inferred right hand side - // the ellipsis (if in the lhs) comes first - if (num_ell_idxes >= 0) { - for (int64_t i = 0; i < num_ell_idxes; ++i) { - idxes_to_preprocessed_dims[first_ell_idx + i] = num_output_dims; - num_output_dims++; + + TORCH_CHECK( + has_ellipsis ? nlabels <= ndims : nlabels == ndims, + "einsum() the number of subscripts in the equation (", + nlabels, + has_ellipsis ? ") is more than the number of dimensions (" + : ") does not match the number of dimensions (", + ndims, + ") for operand ", + i, + has_ellipsis ? "" : " and no ellipsis was given"); + } + + // Mapping of label to index in the permuted tensors (out_dims + sum_dims) + // This will be used for aligning the dimensions of all input operands + std::vector label_perm_index(total_labels, -1); + + // Current index in the permuted shape + int perm_index = 0; + + // Start index of ellipsis dimensions in the permuted shape + int64_t ell_index = 0; + + if (arrow_pos == std::string::npos) { + // Implicit output is ellipsis (...) + labels seen only once + perm_index = ell_num_dim; + for (int label = 0; label < total_labels; ++label) { + if (label_count[label] == 1) { + label_perm_index[label] = perm_index++; } } - // then the indices that occur exactly once in alphabetic order - for (size_t idx = 0; idx < number_of_letters; idx++) { - if (num_letter_occurrences[idx] == 1) { - idxes_to_preprocessed_dims[letter_mapping[idx]] = num_output_dims; - num_output_dims++; + } else { + // Parse explicit output + std::string rhs = equation.substr(arrow_pos + 2); + found_ell = false; + for (std::size_t i = 0; i < rhs.length(); ++i) { + switch (rhs[i]) { + case ' ': + // Ignore spaces + break; + + case '.': + TORCH_CHECK( + // There can only be one ellipsis in the output + !found_ell, + "einsum() found \'.\' for output but an ellipsis (...) was already found"); + TORCH_CHECK( + // Ensure ellipsis is correct + i + 2 < rhs.length() && rhs[++i] == '.' && rhs[++i] == '.', + "einsum() found \'.\' for output that is not part of any ellipsis (...)"); + ell_index = perm_index; + perm_index += ell_num_dim; + found_ell = true; + break; + + default: + TORCH_CHECK( + rhs[i] >= 'a' && rhs[i] <= 'z', + "einsum() subscripts must be in range [a, z] but found ", + rhs[i], + " for the output"); + TORCH_CHECK( + // Ensure label appeared at least once for some input operand and at + // most once for the output + label_count[rhs[i] - 'a'] > 0, + "einsum() output subscript ", + rhs[i], + label_count[rhs[i] - 'a'] == -1 + ? " appears more than once in the output" + : " does not appear in the equation for any input operand"); + label_perm_index[rhs[i] - 'a'] = perm_index++; + + // Set to -1 to mark that this label already appeared in the output + label_count[rhs[i] - 'a'] = -1; } } + + TORCH_CHECK( + // Dimensions under ellipsis are not contracted, so ensure it appears in output + ell_num_dim <= 0 || found_ell, + "einsum() ellipsis (...) 
covering one or more dimensions was given in the input but not in the output"); } - // now we assign the idxes_to_preprocessed_dims (index -> dimension in preprocessed / output tensors) - // for the non-output indices - those that are eventually summed over - int64_t position = num_output_dims; - for (int64_t i = 0; i < num_total_idxes; i++) { - if (idxes_to_preprocessed_dims[i]==-1) { - idxes_to_preprocessed_dims[i] = position; - position++; + + // Save output size before adding sum dims + int out_size = perm_index; + + // Add contraction labels (labels not present in output) + for (int label = 0; label < total_labels; ++label) { + if (label_count[label] > 0 && label_perm_index[label] == -1) { + label_perm_index[label] = perm_index++; } } - // we now "homogenize the dimensions", i.e. - // - take diagonals for duplicated indices - // - permute the dimensions to match the order given by idxes_to_preprocessed_dims - // - unsqueeze to create all dimensions for each index in each tensor where they are missing - // we also check that sizes match - // after this, all operands will have compatible shapes (i.e. all dimensions are aligned are broadcastable) - std::vector preprocessed_operands; - std::vector size_of_dims(num_total_idxes, -1); // keep track of sizes for each index, -1 means we have not seen a size yet - for (int64_t op = 0; op < (int64_t) tensors.size(); op++) { - auto preprocessed_op = tensors[op]; - std::vector idx_to_dim(num_total_idxes, -1); // the dimension which the index refers to in the original tensor, -1 means it does not appear - std::vector& current_op_input_idxes = input_op_idxes[op]; - int64_t dim = 0; // there are two dimension indices: dim is after taking diagonals, i is in input - for (size_t i = 0; i < current_op_input_idxes.size(); i++) { - auto idx = current_op_input_idxes[i]; - auto dim_out = idxes_to_preprocessed_dims[idx]; - if (idx_to_dim[dim_out] == -1) { // first appearance - idx_to_dim[dim_out] = dim; - if (size_of_dims[idx] == -1) { // keep track of sizes - size_of_dims[idx] = preprocessed_op.size(dim); + // Here we unsqueeze missing dimensions to make all operands have the same + // number of dimensions. We take diagonals for repeated labels within the + // same operand. Finally we permute the operands to align dimensions as + // per the perm_out_index we computed above. 
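To make the alignment step concrete, a sketch for "ij,jk->ik" with illustrative shapes: the output labels i and k take permuted positions 0 and 1 and the contracted label j is appended at position 2, so operand 0 (shape {2, 3}) becomes a {2, 1, 3} view and operand 1 (shape {3, 4}) becomes a {1, 4, 3} view; sumproduct_pair then multiplies them and sums over the trailing j dimension.

#include <ATen/ATen.h>

int main() {
  auto a = at::randn({2, 3});
  auto b = at::randn({3, 4});
  auto c = at::einsum("ij,jk->ik", {a, b});
  TORCH_CHECK(c.allclose(a.mm(b)));  // matches a plain matrix multiply
  return 0;
}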
+ std::vector permuted_operands; + for (std::size_t i = 0; i < operands.size(); ++i) { + std::vector perm_shape(perm_index, -1); + std::vector label_dim(total_labels, -1); + std::vector labels = op_labels[i]; + Tensor operand = operands[i]; + std::size_t j = 0; + + for (int label : labels) { + if (label == ELLIPSIS) { + // Add missing dimensions under ellipsis + int64_t num_dim_diff = + ell_num_dim - (operand.dim() - labels.size() + 1); + for (int64_t k = 0; k < num_dim_diff; ++k) { + operand = operand.unsqueeze(j); } - else { - TORCH_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); + for (int64_t k = 0; k < ell_num_dim; ++k) { + perm_shape[ell_index + k] = j++; } - dim++; - } else { // duplicate dimension in tensor --> take diagonal of idx_to_dim[dim_out] and dim and put the diagonal dimension to idx_to_dim[dim_out] - TORCH_CHECK(size_of_dims[idx] == preprocessed_op.size(dim), "size of dimension does not match previous size, operand ", op, ", dim ", i); - preprocessed_op = preprocessed_op.diagonal(0, idx_to_dim[dim_out], dim); - // diagonal moves the diagonal dimension to the back - // now we permute the last dim back to idx_to_dim[dim_out] - std::vector perm(preprocessed_op.dim(), 0); - for (int64_t d = 0; d < preprocessed_op.dim(); d++) { - if (d == idx_to_dim[dim_out]) { - perm[d] = preprocessed_op.dim() - 1; - } else { - perm[d] = d - (d > idx_to_dim[dim_out]); - } - } - preprocessed_op = preprocessed_op.permute(perm); + } else if (label_dim[label] != -1) { + // Repeated label, take diagonal + int64_t dim = label_dim[label]; + TORCH_CHECK( + operand.size(j) == operand.size(dim), + "einsum() subscript ", + char(label + 'a'), + " is repeated for operand ", + i, + " but the sizes don't match, ", + operand.size(j), + " != ", + operand.size(dim)); + operand = operand.diagonal(0, j, dim).movedim(-1, dim); + } else { + // Lookup output index for label + label_dim[label] = j; + perm_shape[label_perm_index[label]] = j++; } } - // now we permute the dimensions in the right order - std::vector permutation; // permutation for this tensor - for (auto &d : idx_to_dim) { - if (d > -1) { - permutation.push_back(d); + + // Add dimensions for missing labels + for (int64_t& index : perm_shape) { + if (index == -1) { + operand = operand.unsqueeze(-1); + index = j++; } } - preprocessed_op = preprocessed_op.permute(permutation); - // finally, we insert dimensions for idxes not in the operand - for (size_t dim = 0; dim < idx_to_dim.size(); dim++) { - if (idx_to_dim[dim] == -1) { - preprocessed_op = preprocessed_op.unsqueeze(dim); + + permuted_operands.push_back(operand.permute(perm_shape)); + } + + // Check if operands broadcast and keep track of last operand with + // dimension size != 1 for optimizing reductions + std::vector dim_last_op(perm_index, 0); + bool has_zero_size_dim = false; + for (int dim = 0; dim < perm_index; ++dim) { + int64_t broadcast_size = permuted_operands[0].size(dim); + for (std::size_t i = 1; i < permuted_operands.size(); ++i) { + int64_t dim_size = permuted_operands[i].size(dim); + if (broadcast_size != dim_size && broadcast_size != 1 && dim_size != 1) { + std::ostringstream msg; + msg << "einsum() operands do not broadcast with remapped shapes [original->remapped]:"; + for (std::size_t j = 0; j < operands.size(); ++j) { + msg << " " << operands[j].sizes() << "->" + << permuted_operands[j].sizes(); + } + TORCH_CHECK(false, msg.str()); + } + if (dim_size != 1) { + broadcast_size = dim_size; + 
dim_last_op[dim] = i; } } + has_zero_size_dim |= broadcast_size == 0; + } - preprocessed_operands.push_back(std::move(preprocessed_op)); + // Compute result + Tensor result = permuted_operands[0]; + + // Fast path for when an operand has zero sized dim + if (has_zero_size_dim) { + std::vector out_shape(out_size); + for (int i = 0; i < out_size; ++i) { + out_shape[i] = permuted_operands[dim_last_op[i]].size(i); + } + return at::zeros(out_shape, result.options()); } - // now we reduce the indices from left to right - // numpy allows to optimize the path using various - // algorithms (see eigen_path in numpy docs) - // we start with the leftmost operator and reduce indices that - // appear only there - Tensor result = std::move(preprocessed_operands[0]); - for (int64_t idx = 0; idx < num_total_idxes; idx++) { - if ((last_idx_occurrence[idx] == 0) - && (idxes_to_preprocessed_dims[idx]>=num_output_dims)) { - result = result.sum(idxes_to_preprocessed_dims[idx], true); + // Sum out or squeeze dimensions that are size 1 for all later operands + int dim = out_size; + for (int i = dim; i < perm_index; ++i, ++dim) { + if (dim_last_op[i] == 0) { + if (result.size(dim) == 1) { + result = result.squeeze(dim--); + } else { + result = result.sum(dim--); + } } } - // now we process each tensor using sumproduct_pair - for (int64_t i = 1; i < (int64_t) preprocessed_operands.size(); i++) { + for (std::size_t i = 1; i < permuted_operands.size(); ++i) { + Tensor operand = permuted_operands[i]; std::vector sum_dims; - for (int64_t idx = 0; idx < num_total_idxes; idx++) { - if ((last_idx_occurrence[idx] == i) - && (idxes_to_preprocessed_dims[idx]>=num_output_dims)) { - sum_dims.push_back(idxes_to_preprocessed_dims[idx]); + + // Sum out or squeeze dimensions that are size 1 for all later operands + dim = out_size; + for (int j = dim; j < perm_index; ++j, ++dim) { + if (dim_last_op[j] < i) { + operand = operand.squeeze(dim); + --dim; + } else if (dim_last_op[j] == i) { + if (result.size(dim) == 1) { + operand = operand.sum(dim); + result = result.squeeze(dim); + --dim; + } else { + sum_dims.push_back(dim); + } } } - result = at::native::sumproduct_pair(result, std::move(preprocessed_operands[i]), sum_dims, true); - } - // finally, we squeeze out all non-result dimensions - auto sizes = result.sizes().vec(); - for (int64_t dim = num_total_idxes-1; dim >= num_output_dims; dim--) { - sizes.erase(sizes.begin() + dim); + + // Multiply tensors and sum out dimensions in sum_dims + if (sum_dims.empty()) { + result = result.mul(operand); + } else if (sum_dims.size() == result.sizes().size()) { + result = result.flatten().dot(operand.flatten()); + } else { + result = sumproduct_pair(result, operand, sum_dims, false); + } } - result = result.view(sizes); return result; } diff --git a/aten/src/ATen/native/ReduceOps.cpp b/aten/src/ATen/native/ReduceOps.cpp index 0b3f9e518b6e..64aaea298093 100644 --- a/aten/src/ATen/native/ReduceOps.cpp +++ b/aten/src/ATen/native/ReduceOps.cpp @@ -730,8 +730,6 @@ Tensor all(const Tensor& self) { "all only supports CPU AND CUDA device type, got: ", self.device().type()); TORCH_CHECK(self.layout() == Layout::Strided, "all only supports strided layout, got: ", self.layout()); - TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte || self.scalar_type() == at::ScalarType::Bool, - "all only supports torch.uint8 and torch.bool dtypes"); Tensor result = at::empty({0}, self.options()); auto iter = make_reduction( @@ -749,8 +747,7 @@ Tensor &all_out(Tensor &result, const Tensor &self, int64_t dim, bool 
keepdim) { "all only supports CPU AND CUDA device type, got: ", self.device().type()); TORCH_CHECK(self.layout() == Layout::Strided, "all only supports strided layout, got: ", self.layout()); - TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte || self.scalar_type() == at::ScalarType::Bool, - "all only supports torch.uint8 and torch.bool dtypes"); + dim = maybe_wrap_dim(dim, self.dim()); if (_dimreduce_return_trivial(result, self, 1, dim, keepdim)) { return result; @@ -776,8 +773,6 @@ Tensor any(const Tensor& self) { "any only supports CPU AND CUDA device type, got: ", self.device().type()); TORCH_CHECK(self.layout() == Layout::Strided || self.layout() == Layout::Sparse, "any only supports strided AND sparse layout, got: ", self.layout()); - TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte || self.scalar_type() == at::ScalarType::Bool, - "all only supports torch.uint8 and torch.bool dtypes"); Tensor result = at::empty({0}, self.options()); auto iter = make_reduction( @@ -795,8 +790,7 @@ Tensor &any_out(Tensor &result, const Tensor &self, int64_t dim, bool keepdim) { "any only supports CPU AND CUDA device type, got: ", self.device().type()); TORCH_CHECK(self.layout() == Layout::Strided, "any only supports strided layout, got: ", self.layout()); - TORCH_CHECK(self.scalar_type() == at::ScalarType::Byte || self.scalar_type() == at::ScalarType::Bool, - "all only supports torch.uint8 and torch.bool dtypes"); + dim = maybe_wrap_dim(dim, self.dim()); if (_dimreduce_return_trivial(result, self, 0, dim, keepdim)) { return result; diff --git a/aten/src/ATen/native/ReduceOpsUtils.h b/aten/src/ATen/native/ReduceOpsUtils.h index 205b08d86423..429b6f49a7bd 100644 --- a/aten/src/ATen/native/ReduceOpsUtils.h +++ b/aten/src/ATen/native/ReduceOpsUtils.h @@ -156,17 +156,15 @@ static void allocate_reduction_result( } static Tensor review_reduce_result(const Tensor& result, int ndim, DimMask mask, bool keepdim) { + if (keepdim) { + return result; + } auto shape = DimVector(result.sizes()); auto stride = DimVector(result.strides()); for (int dim = 0; dim < ndim; dim++) { if (mask[dim]) { - if (!keepdim) { - shape.insert(shape.begin() + dim, 1); - stride.insert(stride.begin() + dim, 0); - } else { - TORCH_INTERNAL_ASSERT(shape[dim] == 1); - stride[dim] = 0; - } + shape.insert(shape.begin() + dim, 1); + stride.insert(stride.begin() + dim, 0); } } return result.as_strided(shape, stride); diff --git a/aten/src/ATen/native/SharedReduceOps.h b/aten/src/ATen/native/SharedReduceOps.h index ae80a9e41be9..437a39bf2b92 100644 --- a/aten/src/ATen/native/SharedReduceOps.h +++ b/aten/src/ATen/native/SharedReduceOps.h @@ -340,6 +340,56 @@ struct NanSumOps { #endif }; +template +struct AndOps { + inline C10_DEVICE acc_t reduce(acc_t a, acc_t b, int64_t /*idx*/) const { + return static_cast(a) && static_cast(b); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return static_cast(a) && static_cast(b); + } + + inline C10_DEVICE acc_t project(acc_t a) const { + return a; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } +#endif +}; + +template +struct OrOps { + inline C10_DEVICE acc_t reduce(acc_t a, acc_t b, int64_t /*idx*/) const { + return static_cast(a) || static_cast(b); + } + + inline C10_DEVICE acc_t combine(acc_t a, acc_t b) const { + return static_cast(a) || static_cast(b); + } + + 
inline C10_DEVICE acc_t project(acc_t a) const { + return a; + } + + static C10_DEVICE acc_t translate_idx(acc_t acc, int64_t /*base_idx*/) { + return acc; + } + +#if defined(__CUDACC__) || defined(__HIPCC__) + inline C10_DEVICE acc_t warp_shfl_down(acc_t data, int offset) const { + return WARP_SHFL_DOWN(data, offset); + } +#endif +}; + namespace detail { template diff --git a/aten/src/ATen/native/TensorIterator.cpp b/aten/src/ATen/native/TensorIterator.cpp index 81a7ae3b9a9d..cb37b2055eda 100644 --- a/aten/src/ATen/native/TensorIterator.cpp +++ b/aten/src/ATen/native/TensorIterator.cpp @@ -542,15 +542,6 @@ void TensorIterator::coalesce_dimensions() { auto can_coalesce = [&](int dim0, int dim1) { auto shape0 = shape_[dim0]; auto shape1 = shape_[dim1]; - if (is_reduction_) { - // The dimension being reduced should not be coalesced - for (int i = 0; i < noutputs(); i++) { - auto& stride = operands_[i].stride_bytes; - if (stride[dim0] == 0 || stride[dim1] == 0) { - return false; - } - } - } if (shape0 == 1 || shape1 == 1) { return true; } @@ -811,7 +802,7 @@ void TensorIterator::narrow(int dim, int64_t start, int64_t size) { for (auto& op : operands_) { op.data = ((char*)op.data) + op.stride_bytes[dim] * start; } - if (size == 1) { + if (size == 1 && !is_reduction_) { coalesce_dimensions(); } } @@ -1406,24 +1397,4 @@ std::array DimCounter::max_2d_step() const { return {step0, step1}; } -std::ostream& operator<<(std::ostream& os, const TensorIterator& iter) { - os << "TensorIterator @ " << &iter << " {" << std::endl; - os << " ntensors() = " << iter.ntensors() << std::endl; - os << " noutputs() = " << iter.noutputs() << std::endl; - os << " shape() = " << iter.shape() << std::endl; - os << " strides(*) = {" << std::endl; - for (int i = 0; i < iter.ntensors(); i++) { - os << " (" << i << ") = " << iter.strides(i) << std::endl; - } - os << " }" << std::endl; - os << " dtype(*) = {" << std::endl; - for (int i = 0; i < iter.ntensors(); i++) { - os << " (" << i << ") = " << iter.dtype(i) << std::endl; - } - os << " }" << std::endl; - os << " is_reduction_ = " << iter.is_reduction_ << std::endl; - os << "}"; - return os; -} - } // namespace at diff --git a/aten/src/ATen/native/TensorIterator.h b/aten/src/ATen/native/TensorIterator.h index 3a9612e158ae..febf21a290dd 100644 --- a/aten/src/ATen/native/TensorIterator.h +++ b/aten/src/ATen/native/TensorIterator.h @@ -1,6 +1,5 @@ #pragma once -#include #include #include #include @@ -297,8 +296,6 @@ struct CAFFE2_API TensorIterator { return true; } - friend CAFFE2_API std::ostream& operator<<(std::ostream& os, const TensorIterator& iter); - protected: void build(TensorIteratorConfig&); @@ -536,6 +533,4 @@ struct CAFFE2_API SplitUntil32Bit { const TensorIterator& iter; }; -CAFFE2_API std::ostream& operator<<(std::ostream& os, const TensorIterator& iter); - } // namespace at diff --git a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp index d56582467894..c3207604f34a 100644 --- a/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp +++ b/aten/src/ATen/native/cpu/ReduceOpsKernel.cpp @@ -232,41 +232,55 @@ static void norm_kernel_tensor_iterator_impl( } static void and_kernel_impl(TensorIterator& iter) { - binary_kernel_reduce_vec( - iter, - [=](uint8_t a, uint8_t b) -> uint8_t { return a && b; }, - [=](Vec256 a, Vec256 b) { - // Adding the implementation here instead of in vec256_base to avoid - // return value inconsistency. 
Other comparison operators in vec256_base - // return -1/0 (all bit 1 / all bit 0) as true/false to follow the AVX2 - // convention. This would be convenient when combined with other - // vectorized operations. For example, one can use the logical operation - // results as a mask for a bit operation to retrieve/reset multiple - // elements in a vector. - // - // In this method, users would expect, e.g., all(), to return 1/0 as - // true/false. - Vec256 c = Vec256(); - for (int i = 0; i != Vec256::size(); i++) { - c[i] = a[i] && b[i]; - } - return c; - }, - /*ident=*/true); + if (c10::isIntegralType(iter.dtype(), /*includeBool=*/true)) { + binary_kernel_reduce_vec( + iter, + [=](uint8_t a, uint8_t b) -> uint8_t { return (a && b) ? 1 : 0; }, + [=](Vec256 a, Vec256 b) { + // Adding the implementation here instead of in vec256_base to avoid + // return value inconsistency. Other comparison operators in + // vec256_base return -1/0 (all bit 1 / all bit 0) as true/false to + // follow the AVX2 convention. This would be convenient when combined + // with other vectorized operations. For example, one can use the + // logical operation results as a mask for a bit operation to + // retrieve/reset multiple elements in a vector. + // + // In this method, users would expect, e.g., all(), to return 1/0 as + // true/false. + Vec256 c = Vec256(); + for (int i = 0; i != Vec256::size(); i++) { + c[i] = (a[i] && b[i]) ? 1 : 0; + } + return c; + }, + /*ident=*/true); + } else { + AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.dtype(), "and_kernel", [&]() { + binary_kernel_reduce( + iter, AndOps(), static_cast(true)); + }); + } } static void or_kernel_impl(TensorIterator& iter) { - binary_kernel_reduce_vec( - iter, - [=](uint8_t a, uint8_t b) -> uint8_t { return a || b; }, - [=](Vec256 a, Vec256 b) { - Vec256 c = Vec256(); - for (int i = 0; i != Vec256::size(); i++) { - c[i] = a[i] || b[i]; - } - return c; - }, - /*ident=*/false); + if (c10::isIntegralType(iter.dtype(), /*includeBool=*/true)) { + binary_kernel_reduce_vec( + iter, + [=](uint8_t a, uint8_t b) -> uint8_t { return (a || b) ? 1 : 0; }, + [=](Vec256 a, Vec256 b) { + Vec256 c = Vec256(); + for (int i = 0; i != Vec256::size(); i++) { + c[i] = (a[i] || b[i]) ? 
1 : 0; + } + return c; + }, + /*ident=*/false); + } else { + AT_DISPATCH_FLOATING_TYPES_AND(kHalf, iter.dtype(), "or_kernel", [&]() { + binary_kernel_reduce( + iter, OrOps(), static_cast(false)); + }); + } } template diff --git a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu index d6a8709ca967..13f278b0d900 100644 --- a/aten/src/ATen/native/cuda/ForeachUnaryOp.cu +++ b/aten/src/ATen/native/cuda/ForeachUnaryOp.cu @@ -4,86 +4,6 @@ namespace at { namespace native { -template class Op> -std::vector foreach_unary_op_complex(TensorList tensors) { - std::vector> tensor_lists; - std::vector vec_res; - vec_res.reserve(tensors.size()); - for (const auto& t: tensors) { - vec_res.emplace_back(at::native::empty_like(t)); - } - - tensor_lists.emplace_back(tensors.vec()); - tensor_lists.emplace_back(std::move(vec_res)); - - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, tensors[0].scalar_type(), "foreach_unary_op_cuda", [&]() { - using opmath_t = get_opmath_t::opmath_t; - multi_tensor_apply<2>(tensor_lists, - UnaryOpFunctor(), - Op()); - }); - return tensor_lists[1]; -} - -template class Op> -void foreach_unary_op_complex_(TensorList tensors) { - std::vector> tensor_lists; - tensor_lists.emplace_back(tensors.vec()); - - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { - using opmath_t = get_opmath_t::opmath_t; - multi_tensor_apply<1>(tensor_lists, - UnaryOpFunctor(), - Op()); - }); -} - -template class Op> -std::vector foreach_unary_op_complex_bfloat16(TensorList tensors) { - std::vector> tensor_lists; - std::vector vec_res; - vec_res.reserve(tensors.size()); - for (const auto& t: tensors) { - vec_res.emplace_back(at::native::empty_like(t)); - } - - tensor_lists.emplace_back(tensors.vec()); - tensor_lists.emplace_back(std::move(vec_res)); - - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, tensors[0].scalar_type(), "foreach_unary_op_cuda", [&]() { - using opmath_t = get_opmath_t::opmath_t; - multi_tensor_apply<2>(tensor_lists, - UnaryOpFunctor(), - Op()); - }); - return tensor_lists[1]; -} - -template class Op> -void foreach_unary_op_complex_bfloat16_(TensorList tensors) { - std::vector> tensor_lists; - tensor_lists.emplace_back(tensors.vec()); - - AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { - using opmath_t = get_opmath_t::opmath_t; - multi_tensor_apply<1>(tensor_lists, - UnaryOpFunctor(), - Op()); - }); -} - template class Op> std::vector foreach_unary_op(TensorList tensors) { std::vector> tensor_lists; @@ -96,7 +16,7 @@ std::vector foreach_unary_op(TensorList tensors) { tensor_lists.emplace_back(tensors.vec()); tensor_lists.emplace_back(std::move(vec_res)); - AT_DISPATCH_FLOATING_TYPES_AND(ScalarType::Half, tensors[0].scalar_type(), "foreach_unary_op_cuda", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, tensors[0].scalar_type(), "foreach_unary_op_cuda", [&]() { using opmath_t = get_opmath_t::opmath_t; multi_tensor_apply<2>(tensor_lists, UnaryOpFunctor> tensor_lists; tensor_lists.emplace_back(tensors.vec()); - AT_DISPATCH_FLOATING_TYPES_AND_HALF(tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { - using opmath_t = get_opmath_t::opmath_t; - multi_tensor_apply<1>(tensor_lists, - UnaryOpFunctor(), - Op()); - }); -} - -template class Op> -void foreach_op_unary_(TensorList tensors) { - std::vector> 
tensor_lists; - tensor_lists.emplace_back(tensors.vec()); - - AT_DISPATCH_FLOATING_TYPES_AND_HALF(tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { - using opmath_t = get_opmath_t::opmath_t; - multi_tensor_apply<1>(tensor_lists, - UnaryOpFunctor(), - Op()); - }); -} - -template class Op> -std::vector foreach_unary_op_bfloat16(TensorList tensors) { - std::vector> tensor_lists; - std::vector vec_res; - vec_res.reserve(tensors.size()); - for (const auto& t: tensors) { - vec_res.emplace_back(at::native::empty_like(t)); - } - - tensor_lists.emplace_back(tensors.vec()); - tensor_lists.emplace_back(std::move(vec_res)); - - AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, tensors[0].scalar_type(), "foreach_unary_op_cuda", [&]() { - using opmath_t = get_opmath_t::opmath_t; - multi_tensor_apply<2>(tensor_lists, - UnaryOpFunctor(), - Op()); - }); - return tensor_lists[1]; -} - -template class Op> -void foreach_unary_op_bfloat16_(TensorList tensors) { - std::vector> tensor_lists; - tensor_lists.emplace_back(tensors.vec()); - - AT_DISPATCH_FLOATING_TYPES_AND2(ScalarType::Half, ScalarType::BFloat16, tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { + AT_DISPATCH_FLOATING_AND_COMPLEX_TYPES_AND1(ScalarType::Half, tensors[0].scalar_type(), "foreach_unary_op_cuda_", [&]() { using opmath_t = get_opmath_t::opmath_t; multi_tensor_apply<1>(tensor_lists, UnaryOpFunctor \ -struct NAME1 { \ - __device__ T operator()(T t) const { return std::NAME(t); } \ -}; \ - \ -std::vector foreach_tensor_##NAME##_cuda(TensorList tensors) { \ - check_foreach_api_restrictions(tensors); \ - if (!can_use_fast_route(tensors)) { \ - return at::native::foreach_tensor_##NAME##_slow(tensors); \ - } \ - \ - return foreach_unary_op_complex(tensors); \ -} \ - \ -void foreach_tensor_##NAME##_cuda_(TensorList tensors) { \ - check_foreach_api_restrictions(tensors); \ - if (!can_use_fast_route(tensors)) { \ - return at::native::foreach_tensor_##NAME##_slow_(tensors); \ - } \ - \ - foreach_unary_op_complex_(tensors); \ -} - -#define FOREACH_UNARY_OP_COMPLEX_BFLOAT16(NAME, NAME1) \ -template \ -struct NAME1 { \ - __device__ T operator()(T t) const { return std::NAME(t); } \ -}; \ - \ -std::vector foreach_tensor_##NAME##_cuda(TensorList tensors) { \ - check_foreach_api_restrictions(tensors); \ - if (!can_use_fast_route(tensors)) { \ - return at::native::foreach_tensor_##NAME##_slow(tensors); \ - } \ - \ - return foreach_unary_op_complex_bfloat16(tensors); \ -} \ - \ -void foreach_tensor_##NAME##_cuda_(TensorList tensors) { \ - check_foreach_api_restrictions(tensors); \ - if (!can_use_fast_route(tensors)) { \ - return at::native::foreach_tensor_##NAME##_slow_(tensors); \ - } \ - \ - foreach_unary_op_complex_bfloat16_(tensors); \ -} - #define FOREACH_UNARY_OP(NAME, NAME1) \ template \ struct NAME1 { \ @@ -252,101 +68,7 @@ void foreach_tensor_##NAME##_cuda_(TensorList tensors) { \ foreach_unary_op_(tensors); \ } -#define FOREACH_UNARY_OP_BFLOAT16(NAME, NAME1) \ -template \ -struct NAME1 { \ - __device__ T operator()(T t) const { return std::NAME(t); } \ -}; \ - \ -std::vector foreach_tensor_##NAME##_cuda(TensorList tensors) { \ - check_foreach_api_restrictions(tensors); \ - \ - if (!can_use_fast_route(tensors)) { \ - return at::native::foreach_tensor_##NAME##_slow(tensors); \ - } \ - \ - return foreach_unary_op_bfloat16(tensors); \ -} \ - \ -void foreach_tensor_##NAME##_cuda_(TensorList tensors) { \ - check_foreach_api_restrictions(tensors); \ - \ - if (!can_use_fast_route(tensors)) { \ - return 
at::native::foreach_tensor_##NAME##_slow_(tensors); \ - } \ - \ - foreach_unary_op_bfloat16_(tensors); \ -} - -FOREACH_UNARY_OP(ceil, Ceil); -FOREACH_UNARY_OP(erfc, Erfc); -FOREACH_UNARY_OP(expm1, Expm1); -FOREACH_UNARY_OP(floor, Floor); -FOREACH_UNARY_OP(lgamma, Lgamma); - -FOREACH_UNARY_OP_BFLOAT16(log1p, Log1p); -FOREACH_UNARY_OP_BFLOAT16(erf, Erf); - -FOREACH_UNARY_OP_COMPLEX(acos, Acos); -FOREACH_UNARY_OP_COMPLEX(asin, Asin); -FOREACH_UNARY_OP_COMPLEX(atan, Atan); -FOREACH_UNARY_OP_COMPLEX(cosh, Cosh); -FOREACH_UNARY_OP_COMPLEX(tan, Tan); -FOREACH_UNARY_OP_COMPLEX(sin, Sin); -FOREACH_UNARY_OP_COMPLEX(sinh, Sinh); - -FOREACH_UNARY_OP_COMPLEX_BFLOAT16(abs, Abs); -FOREACH_UNARY_OP_COMPLEX_BFLOAT16(exp, Exp); -FOREACH_UNARY_OP_COMPLEX_BFLOAT16(sqrt, Sqrt); -FOREACH_UNARY_OP_COMPLEX_BFLOAT16(cos, Cos); -FOREACH_UNARY_OP_COMPLEX_BFLOAT16(tanh, Tanh); -FOREACH_UNARY_OP_COMPLEX_BFLOAT16(log, Log); -FOREACH_UNARY_OP_COMPLEX_BFLOAT16(log10, Log10); -FOREACH_UNARY_OP_COMPLEX_BFLOAT16(log2, Log2); - -std::vector foreach_tensor_neg_cuda(TensorList tensors) { - check_foreach_api_restrictions(tensors); - - if (!can_use_fast_route(tensors)) { - return at::native::foreach_tensor_neg_slow(tensors); - } - - return foreach_unary_op_complex_bfloat16(tensors); -} - -void foreach_tensor_neg_cuda_(TensorList tensors) { - check_foreach_api_restrictions(tensors); - - if (!can_use_fast_route(tensors)) { - return at::native::foreach_tensor_neg_slow_(tensors); - } - - foreach_unary_op_complex_bfloat16_(tensors); -} - -template \ -struct Round { \ - __device__ T operator()(T t) const { return std::nearbyint(t); } \ -}; - -std::vector foreach_tensor_round_cuda(TensorList tensors) { - check_foreach_api_restrictions(tensors); - - if (!can_use_fast_route(tensors)) { - return at::native::foreach_tensor_round_slow(tensors); - } - - return foreach_unary_op(tensors); -} - -void foreach_tensor_round_cuda_(TensorList tensors) { - check_foreach_api_restrictions(tensors); - - if (!can_use_fast_route(tensors)) { - return at::native::foreach_tensor_round_slow_(tensors); - } - - foreach_unary_op_(tensors); -} +FOREACH_UNARY_OP(exp, Exp); +FOREACH_UNARY_OP(sqrt, Sqrt); }} // namespace at::native diff --git a/aten/src/ATen/native/cuda/ReduceLogicKernel.cu b/aten/src/ATen/native/cuda/ReduceLogicKernel.cu index ca2db43637dd..a29a926ef257 100644 --- a/aten/src/ATen/native/cuda/ReduceLogicKernel.cu +++ b/aten/src/ATen/native/cuda/ReduceLogicKernel.cu @@ -8,17 +8,25 @@ namespace at { namespace native { void and_kernel_cuda(TensorIterator& iter) { - gpu_reduce_kernel( - iter, func_wrapper ([]GPU_LAMBDA(uint8_t a, uint8_t b) -> uint8_t { - return a && b; - }), true); + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(), "and_kernel", [&]() { + gpu_reduce_kernel( + iter, + func_wrapper([] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return static_cast(static_cast(a) && static_cast(b)); + }), + static_cast(true)); + }); } void or_kernel_cuda(TensorIterator& iter) { - gpu_reduce_kernel( - iter, func_wrapper ([]GPU_LAMBDA(uint8_t a, uint8_t b) -> uint8_t { - return a || b; - }), false); + AT_DISPATCH_ALL_TYPES_AND2(kHalf, kBool, iter.dtype(), "or_kernel", [&]() { + gpu_reduce_kernel( + iter, + func_wrapper([] GPU_LAMBDA(scalar_t a, scalar_t b) -> scalar_t { + return static_cast(static_cast(a) || static_cast(b)); + }), + static_cast(false)); + }); } REGISTER_DISPATCH(and_stub, &and_kernel_cuda); diff --git a/aten/src/ATen/native/cuda/Sorting.cu b/aten/src/ATen/native/cuda/Sorting.cu index c6688b286914..889ccf606152 100644 --- 
a/aten/src/ATen/native/cuda/Sorting.cu +++ b/aten/src/ATen/native/cuda/Sorting.cu @@ -250,7 +250,7 @@ void kthvalue_cuda_template( int64_t dim_, bool keepdim) { int64_t dim = maybe_wrap_dim(dim_, self.dim()); - int64_t slicesize = self.size(dim); + int64_t slicesize = self.dim() == 0 ? 1 : self.size(dim); // FIXME: This seems bogus, I only do this because it was the old behaviour. // The reductions are fine, as long as the axis being reduced along // isn't of 0 elements (and the output has elements). diff --git a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm index e56845fd1f9e..bbcbfe10fd01 100644 --- a/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm +++ b/aten/src/ATen/native/metal/mpscnn/tests/MPSCNNTests.mm @@ -7,7 +7,6 @@ #import #include -#include #import #include diff --git a/aten/src/ATen/native/native_functions.yaml b/aten/src/ATen/native/native_functions.yaml index 0324ed352c95..2dcd3d234e46 100644 --- a/aten/src/ATen/native/native_functions.yaml +++ b/aten/src/ATen/native/native_functions.yaml @@ -7117,358 +7117,6 @@ CPU: foreach_tensor_sqrt_slow_ CUDA: foreach_tensor_sqrt_cuda_ -- func: _foreach_abs(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_abs_slow - CUDA: foreach_tensor_abs_cuda - -- func: _foreach_abs_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_abs_slow_ - CUDA: foreach_tensor_abs_cuda_ - -- func: _foreach_acos(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_acos_slow - CUDA: foreach_tensor_acos_cuda - -- func: _foreach_acos_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_acos_slow_ - CUDA: foreach_tensor_acos_cuda_ - -- func: _foreach_asin(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_asin_slow - CUDA: foreach_tensor_asin_cuda - -- func: _foreach_asin_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_asin_slow_ - CUDA: foreach_tensor_asin_cuda_ - -- func: _foreach_atan(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_atan_slow - CUDA: foreach_tensor_atan_cuda - -- func: _foreach_atan_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_atan_slow_ - CUDA: foreach_tensor_atan_cuda_ - -- func: _foreach_ceil(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_ceil_slow - CUDA: foreach_tensor_ceil_cuda - -- func: _foreach_ceil_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_ceil_slow_ - CUDA: foreach_tensor_ceil_cuda_ - -- func: _foreach_cos(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_cos_slow - CUDA: foreach_tensor_cos_cuda - -- func: _foreach_cos_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_cos_slow_ - CUDA: 
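Editor's note: the Sorting.cu change at the top of this hunk guards kthvalue against zero-dimensional inputs: a scalar tensor has dim() == 0, so size(dim) is not queryable, yet it still holds exactly one element to reduce over. The same slice-size computation over a plain shape vector (illustrative only):

#include <cassert>
#include <cstdint>
#include <iostream>
#include <vector>

// Number of elements along `dim`, treating a 0-d (scalar) shape as one element.
int64_t slice_size(const std::vector<int64_t>& sizes, int64_t dim) {
  if (sizes.empty()) {
    return 1;  // scalar: dim() == 0 but there is exactly one value
  }
  assert(dim >= 0 && dim < static_cast<int64_t>(sizes.size()));
  return sizes[dim];
}

int main() {
  std::cout << slice_size({}, 0) << " "        // 1
            << slice_size({4, 5}, 1) << "\n";  // 5
}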
foreach_tensor_cos_cuda_ - -- func: _foreach_cosh(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_cosh_slow - CUDA: foreach_tensor_cosh_cuda - -- func: _foreach_cosh_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_cosh_slow_ - CUDA: foreach_tensor_cosh_cuda_ - -- func: _foreach_erf(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_erf_slow - CUDA: foreach_tensor_erf_cuda - -- func: _foreach_erf_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_erf_slow_ - CUDA: foreach_tensor_erf_cuda_ - -- func: _foreach_erfc(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_erfc_slow - CUDA: foreach_tensor_erfc_cuda - -- func: _foreach_erfc_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_erfc_slow_ - CUDA: foreach_tensor_erfc_cuda_ - -- func: _foreach_expm1(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_expm1_slow - CUDA: foreach_tensor_expm1_cuda - -- func: _foreach_expm1_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_expm1_slow_ - CUDA: foreach_tensor_expm1_cuda_ - -- func: _foreach_floor(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_floor_slow - CUDA: foreach_tensor_floor_cuda - -- func: _foreach_floor_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_floor_slow_ - CUDA: foreach_tensor_floor_cuda_ - -- func: _foreach_log(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_log_slow - CUDA: foreach_tensor_log_cuda - -- func: _foreach_log_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_log_slow_ - CUDA: foreach_tensor_log_cuda_ - -- func: _foreach_log10(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_log10_slow - CUDA: foreach_tensor_log10_cuda - -- func: _foreach_log10_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_log10_slow_ - CUDA: foreach_tensor_log10_cuda_ - -- func: _foreach_log1p(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_log1p_slow - CUDA: foreach_tensor_log1p_cuda - -- func: _foreach_log1p_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_log1p_slow_ - CUDA: foreach_tensor_log1p_cuda_ - -- func: _foreach_log2(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_log2_slow - CUDA: foreach_tensor_log2_cuda - -- func: _foreach_log2_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False 
- variants: function - dispatch: - CPU: foreach_tensor_log2_slow_ - CUDA: foreach_tensor_log2_cuda_ - -- func: _foreach_neg(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_neg_slow - CUDA: foreach_tensor_neg_cuda - -- func: _foreach_neg_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_neg_slow_ - CUDA: foreach_tensor_neg_cuda_ - -- func: _foreach_tan(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_tan_slow - CUDA: foreach_tensor_tan_cuda - -- func: _foreach_tan_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_tan_slow_ - CUDA: foreach_tensor_tan_cuda_ - -- func: _foreach_tanh(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_tanh_slow - CUDA: foreach_tensor_tanh_cuda - -- func: _foreach_tanh_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_tanh_slow_ - CUDA: foreach_tensor_tanh_cuda_ - -- func: _foreach_sin(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_sin_slow - CUDA: foreach_tensor_sin_cuda - -- func: _foreach_sin_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_sin_slow_ - CUDA: foreach_tensor_sin_cuda_ - -- func: _foreach_sinh(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_sinh_slow - CUDA: foreach_tensor_sinh_cuda - -- func: _foreach_sinh_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_sinh_slow_ - CUDA: foreach_tensor_sinh_cuda_ - -- func: _foreach_round(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_round_slow - CUDA: foreach_tensor_round_cuda - -- func: _foreach_round_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_round_slow_ - CUDA: foreach_tensor_round_cuda_ - -- func: _foreach_lgamma(Tensor[] tensors) -> Tensor[] - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_lgamma_slow - CUDA: foreach_tensor_lgamma_cuda - -- func: _foreach_lgamma_(Tensor(a!)[] self) -> () - use_c10_dispatcher: full - device_guard: False - variants: function - dispatch: - CPU: foreach_tensor_lgamma_slow_ - CUDA: foreach_tensor_lgamma_cuda_ - - func: _foreach_addcdiv_.Scalar(Tensor(a!)[] self, Tensor[] tensor1, Tensor[] tensor2, Scalar value=1) -> () use_c10_dispatcher: full device_guard: False diff --git a/aten/src/ATen/native/vulkan/VulkanAten.cpp b/aten/src/ATen/native/vulkan/VulkanAten.cpp index 6a781c3ab69b..18df3ae818f3 100644 --- a/aten/src/ATen/native/vulkan/VulkanAten.cpp +++ b/aten/src/ATen/native/vulkan/VulkanAten.cpp @@ -58,7 +58,7 @@ Tensor empty( const TensorOptions& options, const optional memory_format) { TORCH_CHECK( - !options.has_pinned_memory(), + !options.pinned_memory(), "'pin_memory' argument is incompatible with Vulkan tensor"); TORCH_CHECK( 
!options.has_memory_format() && !memory_format, @@ -519,6 +519,7 @@ Tensor mean( const IntArrayRef dim, const bool keepdim, const optional dtype) { + TORCH_INTERNAL_ASSERT(!keepdim, "keepdim not implemented for Vulkan mean"); TORCH_INTERNAL_ASSERT(self.is_vulkan(), "mean expects Vulkan tensor input"); // Mean is implemented only for HW dimensions of 4-d tensor @@ -541,7 +542,7 @@ Tensor mean( TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl("slice.Tensor", TORCH_FN(at::native::vulkan::aten::slice)); - m.impl("reshape", TORCH_FN(at::native::vulkan::aten::reshape)); + m.impl("view", TORCH_FN(at::native::vulkan::aten::reshape)); m.impl("select.int", TORCH_FN(at::native::vulkan::aten::select)); m.impl("transpose.int", TORCH_FN(at::native::vulkan::aten::transpose)); m.impl_UNBOXED("transpose_", at::native::vulkan::aten::transpose_); @@ -567,8 +568,8 @@ TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl("add.Scalar", TORCH_FN(at::native::vulkan::aten::add_scalar)); m.impl_UNBOXED( "convolution_overrideable", at::native::vulkan::aten::convolution); - m.impl_UNBOXED("hardtanh_", at::native::vulkan::aten::hardtanh_); - m.impl_UNBOXED("relu_", at::native::vulkan::aten::relu_); + m.impl("hardtanh_", at::native::vulkan::aten::hardtanh_); + m.impl("relu_", at::native::vulkan::aten::relu_); m.impl_UNBOXED("add_.Tensor", at::native::vulkan::aten::add_); } diff --git a/aten/src/ATen/native/vulkan/VulkanOps.cpp b/aten/src/ATen/native/vulkan/VulkanOps.cpp index f65e6b3336f5..8ad79a0c6f31 100644 --- a/aten/src/ATen/native/vulkan/VulkanOps.cpp +++ b/aten/src/ATen/native/vulkan/VulkanOps.cpp @@ -29,7 +29,6 @@ void upsample_nearest2d( float scaleH, float scaleW) { auto device = context().device(); - auto physicalDevice = context().physicalDevice(); int64_t C = IN * IC; struct ConstBlock { float scaleX; @@ -477,7 +476,6 @@ void add( auto W = os4[3]; auto device = context().device(); - auto physicalDevice = context().physicalDevice(); struct ConstBlock { float alpha; }; @@ -1115,10 +1113,8 @@ void clamp( auto C = sizes[0] * sizes[1]; auto H = sizes[2]; auto W = sizes[3]; - auto C_4 = UP_DIV(C, 4); auto device = context().device(); - auto physicalDevice = context().physicalDevice(); struct ConstBlock { float min; float max; @@ -1170,14 +1166,10 @@ void addmm( const auto m2Sizes = m2.sizes(); TORCH_INTERNAL_ASSERT(m1Sizes.size() == 2); TORCH_INTERNAL_ASSERT(m2Sizes.size() == 2); - const auto m1H = m1Sizes[0]; const auto m1W = m1Sizes[1]; const auto m1C = 1; - const auto m1C_4 = UP_DIV(m1C, 4); const auto m2H = m2Sizes[0]; - const auto m2W = m2Sizes[1]; const auto m2C = 1; - const auto m2C_4 = UP_DIV(m2C, 4); const auto OH = m1Sizes[0]; const auto OW = m2Sizes[1]; @@ -1186,7 +1178,6 @@ void addmm( const auto C = m1C; const auto C_4 = UP_DIV(C, 4); - const auto K = m1W; auto device = context().device(); @@ -1206,15 +1197,14 @@ void addmm( VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, + VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, }; } else { descriptorTypes = { VK_DESCRIPTOR_TYPE_STORAGE_IMAGE, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, VK_DESCRIPTOR_TYPE_COMBINED_IMAGE_SAMPLER, - VK_DESCRIPTOR_TYPE_UNIFORM_BUFFER, }; } @@ -1228,9 +1218,9 @@ void addmm( output.image()->bindStorageImage(descriptorSet, 0); m1.image()->bindShaderRead(descriptorSet, 1); m2.image()->bindShaderRead(descriptorSet, 2); - constBuffer.bind(descriptorSet, 3); if (hasT) { - 
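Editor's note: the Vulkan mean above is restricted to reducing the H and W dimensions of a 4-d NCHW tensor, and now rejects keepdim outright; the reworked shader (renamed to mean2d just below) produces one output value per (n, c) pair. What each invocation computes, as a plain C++ sketch over a flat NCHW buffer, not the Vulkan code itself:

#include <cstddef>
#include <iostream>
#include <vector>

// Mean over the H and W dimensions of a flat NCHW buffer: one value per (n, c).
std::vector<float> mean_hw(const std::vector<float>& input,
                           std::size_t N, std::size_t C, std::size_t H, std::size_t W) {
  std::vector<float> out(N * C, 0.f);
  for (std::size_t n = 0; n < N; ++n)
    for (std::size_t c = 0; c < C; ++c) {
      float sum = 0.f;
      for (std::size_t h = 0; h < H; ++h)
        for (std::size_t w = 0; w < W; ++w)
          sum += input[((n * C + c) * H + h) * W + w];
      out[n * C + c] = sum / static_cast<float>(H * W);
    }
  return out;
}

int main() {
  std::vector<float> x{1, 2, 3, 4, 10, 20, 30, 40};               // N=1, C=2, H=2, W=2
  for (float v : mean_hw(x, 1, 2, 2, 2)) std::cout << v << ' ';   // 2.5 25
  std::cout << '\n';
}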
(*t).image()->bindShaderRead(descriptorSet, 4); + (*t).image()->bindShaderRead(descriptorSet, 3); + constBuffer.bind(descriptorSet, 4); } WorkGroupSize workGroupSize{8, 8, 1}; @@ -1268,17 +1258,13 @@ void mean(VulkanTensor& output, const VulkanTensor& input) { int32_t C = safe_downcast(isizes[1]); int32_t H = safe_downcast(isizes[2]); int32_t W = safe_downcast(isizes[3]); - int32_t C_4 = UP_DIV(N * C, 4); auto device = context().device(); - auto physicalDevice = context().physicalDevice(); struct ConstBlock { int32_t W; int32_t H; - int32_t OW; - int32_t OH; }; - ConstBlock cb{W, H, C, N}; + ConstBlock cb{W, H}; VBuffer constBuffer = makeUniformConstBuffer((void*)&cb, sizeof(cb)); VkDescriptorSetLayout descriptorSetLayout{}; @@ -1301,12 +1287,12 @@ void mean(VulkanTensor& output, const VulkanTensor& input) { WorkGroupSize workGroupSize{1, 1, 1}; auto& computeUnit = context().computeUnitFactory().get( - GLSL_SPV(mean), descriptorSetLayout, workGroupSize); + GLSL_SPV(mean2d), descriptorSetLayout, workGroupSize); computeUnit.createCommandBuffer(descriptorSet); auto commandBuffer = computeUnit.commandBuffer(); output.image()->addImageMemoryBarrierToGeneral(commandBuffer); input.image()->addImageMemoryBarrierToShaderRead(commandBuffer); - computeUnit.dispatchCommandBuffer(1, 1, C_4, workGroupSize); + computeUnit.dispatchCommandBuffer(C, N, 1, workGroupSize); computeUnit.endCommandBuffer(); computeUnit.submitAndWaitCommandBuffer(); vkDestroyDescriptorPool(device, descriptorPool, nullptr); diff --git a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl index 1c6f5d98fc21..c2962844e0bc 100644 --- a/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl +++ b/aten/src/ATen/native/vulkan/glsl/conv2d_pw.glsl @@ -17,6 +17,7 @@ layout(set = 0, binding = 4) uniform PRECISION restrict Block ivec2 stride; ivec2 padding; vec2 clamp; + int W; } uBlock; layout(local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; @@ -38,10 +39,37 @@ void main() { const vec4 In = texelFetch(uInput, ivec3(ipos.x, ipos.y, z), 0); const ivec4 kz = block + 4 * z; - sum = fma(In.xxxx, texelFetch(uKernel, ivec3(0, 0, kz.x), 0), sum); - sum = fma(In.yyyy, texelFetch(uKernel, ivec3(0, 0, kz.y), 0), sum); - sum = fma(In.zzzz, texelFetch(uKernel, ivec3(0, 0, kz.z), 0), sum); - sum = fma(In.wwww, texelFetch(uKernel, ivec3(0, 0, kz.w), 0), sum); + const int W = uBlock.W; + + const vec4 val1 = vec4( + texelFetch(uKernel, ivec3((4*kz.x+0)%W, ((4*kz.x+0))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.x+1)%W, ((4*kz.x+1))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.x+2)%W, ((4*kz.x+2))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.x+3)%W, ((4*kz.x+3))/W, 0), 0).x + ); + const vec4 val2 = vec4( + texelFetch(uKernel, ivec3((4*kz.y+0)%W, ((4*kz.y+0))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.y+1)%W, ((4*kz.y+1))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.y+2)%W, ((4*kz.y+2))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.y+3)%W, ((4*kz.y+3))/W, 0), 0).x + ); + const vec4 val3 = vec4( + texelFetch(uKernel, ivec3((4*kz.z+0)%W, ((4*kz.z+0))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.z+1)%W, ((4*kz.z+1))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.z+2)%W, ((4*kz.z+2))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.z+3)%W, ((4*kz.z+3))/W, 0), 0).x + ); + const vec4 val4 = vec4( + texelFetch(uKernel, ivec3((4*kz.w+0)%W, ((4*kz.w+0))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.w+1)%W, ((4*kz.w+1))/W, 0), 0).x, + texelFetch(uKernel, ivec3((4*kz.w+2)%W, ((4*kz.w+2))/W, 0), 
0).x, + texelFetch(uKernel, ivec3((4*kz.w+3)%W, ((4*kz.w+3))/W, 0), 0).x + ); + + sum = fma(In.xxxx, val1, sum); + sum = fma(In.yyyy, val2, sum); + sum = fma(In.zzzz, val3, sum); + sum = fma(In.wwww, val4, sum); } imageStore( diff --git a/aten/src/ATen/native/vulkan/ops/Clamp.cpp b/aten/src/ATen/native/vulkan/ops/Clamp.cpp index d56bbb2f4f82..6aec84d8b349 100644 --- a/aten/src/ATen/native/vulkan/ops/Clamp.cpp +++ b/aten/src/ATen/native/vulkan/ops/Clamp.cpp @@ -120,11 +120,37 @@ Tensor& clamp_( return self_arg; } +Tensor hardtanh( + const Tensor& self, + const Scalar min, + const Scalar max) { + return ops::clamp(self, min, max); +} + +Tensor& hardtanh_( + Tensor& self, + const Scalar min, + const Scalar max) { + return ops::clamp_(self, min, max); +} + +Tensor relu(const Tensor& self) { + return ops::clamp(self, 0, c10::nullopt); +} + +Tensor& relu_(Tensor& self) { + return ops::clamp_(self, 0, c10::nullopt); +} + #ifdef USE_VULKAN_API TORCH_LIBRARY_IMPL(aten, Vulkan, m) { m.impl("clamp", TORCH_FN(clamp)); m.impl("clamp_", TORCH_FN(clamp_)); + m.impl_UNBOXED("hardtanh", hardtanh); + m.impl_UNBOXED("hardtanh_", hardtanh_); + m.impl_UNBOXED("relu", relu); + m.impl_UNBOXED("relu_", relu_); } #endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/native/vulkan/ops/Convolution.cpp b/aten/src/ATen/native/vulkan/ops/Convolution.cpp index 4dd85f004c5e..5bec92abb53d 100644 --- a/aten/src/ATen/native/vulkan/ops/Convolution.cpp +++ b/aten/src/ATen/native/vulkan/ops/Convolution.cpp @@ -494,11 +494,31 @@ void conv2d_pointwise( using namespace api::utils; if (v_output.has_image() && v_input.has_image() && v_weight.has_image()) { + + vTensor v_weight_reshaped{ + context, + {1,1, v_weight.sizes()[0], v_weight.sizes()[1]}, + v_input.options(), + }; + + api::Command::Buffer temp_command_buffer = + api::context()->command().pool.allocate(); + temp_command_buffer.begin(); + + temp_command_buffer.copy( + v_weight.buffer(temp_command_buffer), + v_weight_reshaped.buffer(temp_command_buffer, vTensor::Access::Write) + ); + + temp_command_buffer.end(); + temp_command_buffer.submit(api::context()->gpu().queue); + const struct { int32_t kernel_ic, kernel_oc; int32_t stride_x, stride_y; int32_t padding_x, padding_y; float clamp_x, clamp_y; + int32_t w; } block { safe_downcast(filter[Layout::Filter::input]), safe_downcast(filter[Layout::Filter::output]), @@ -508,6 +528,7 @@ void conv2d_pointwise( safe_downcast(padding[Layout::Parameter::height]), output_min, output_max, + v_weight.sizes()[1], }; context->dispatch( @@ -529,7 +550,7 @@ void conv2d_pointwise( v_input.image(command_buffer), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. - v_weight.image(command_buffer), + v_weight_reshaped.image(command_buffer, vTensor::Access::Read), // Read-only access is implied on const tensors and triggers an async // synchronization if necessary. 
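Editor's note: the conv2d_pw.glsl rewrite above stops sampling the weight as a 3-D texture and instead indexes the reshaped weight as a flat sequence, recovering 2-D texel coordinates from a linear index with % W and / W, where W is the width passed through uBlock.W (taken from the second weight dimension in the hunk above). The same address arithmetic as a standalone C++17 sketch:

#include <iostream>
#include <utility>

// Map a linear element index into (x, y) texel coordinates of a W-wide image.
std::pair<int, int> texel_coords(int linear_index, int W) {
  return {linear_index % W, linear_index / W};
}

int main() {
  const int W = 16;  // illustrative width of the repacked weight image
  for (int k : {0, 15, 16, 37}) {
    auto [x, y] = texel_coords(k, W);
    std::cout << k << " -> (" << x << ", " << y << ")\n";
  }
}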
v_bias.buffer(command_buffer), diff --git a/aten/src/ATen/native/vulkan/ops/Mm.cpp b/aten/src/ATen/native/vulkan/ops/Mm.cpp index f15ef15969aa..185f66226e15 100644 --- a/aten/src/ATen/native/vulkan/ops/Mm.cpp +++ b/aten/src/ATen/native/vulkan/ops/Mm.cpp @@ -28,14 +28,26 @@ Tensor addmm( const auto mat1_sizes = mat1.sizes(); const auto mat2_sizes = mat2.sizes(); - TORCH_CHECK( - (mat1_sizes[Layout::Parameter::width] == - mat2_sizes[Layout::Parameter::height]) && - (self_sizes[Layout::Parameter::height] == - mat1_sizes[Layout::Parameter::height]) && - (self_sizes[Layout::Parameter::width] == - mat2_sizes[Layout::Parameter::width]), - "Incompatible matrix dimensions!"); + if (self_sizes.size() >= 2) { + TORCH_CHECK( + (mat1_sizes[Layout::Parameter::width] == + mat2_sizes[Layout::Parameter::height]) && + (self_sizes[Layout::Parameter::height] == + mat1_sizes[Layout::Parameter::height]) && + (self_sizes[Layout::Parameter::width] == + mat2_sizes[Layout::Parameter::width]), + "Incompatible matrix dimensions!"); + } + else { + TORCH_CHECK( + (mat1_sizes[Layout::Parameter::width] == + mat2_sizes[Layout::Parameter::height]) && + ((self_sizes[Layout::Parameter::height] == + mat1_sizes[Layout::Parameter::height]) || + (self_sizes[Layout::Parameter::height] == + mat2_sizes[Layout::Parameter::width])), + "Incompatible matrix dimensions!"); + } vTensor v_output{ context, diff --git a/aten/src/ATen/test/vec256_test_all_types.h b/aten/src/ATen/test/vec256_test_all_types.h index 3226af8422d1..353f1e2c2b58 100644 --- a/aten/src/ATen/test/vec256_test_all_types.h +++ b/aten/src/ATen/test/vec256_test_all_types.h @@ -1211,22 +1211,7 @@ std::enable_if_t::value, T> local_sqrt(T x) { template std::enable_if_t>::value, Complex> local_sqrt(Complex x) { -#if defined(TEST_AGAINST_DEFAULT) return std::sqrt(x); -#else - PreventFma noFma; - // sqrt(2) / 2 * [sqrt(abs() + a) + sgn(b) * sqrt(abs() - a)i] - T real = x.real(); - T imag = x.imag(); - T abs = local_abs(x).real(); - T sqrt2_2 = std::sqrt(static_cast(2)) / static_cast(2); - T abs_r = noFma.add(abs, real); - T abs_i = noFma.sub(abs, real); - T res_r = sqrt2_2 * std::sqrt(abs_r); - T res_i = sqrt2_2 * std::sqrt(abs_i); - if (std::signbit(imag)) res_i = -res_i; - return Complex(res_r, res_i); -#endif } template @@ -1236,26 +1221,7 @@ std::enable_if_t::value, T> local_asin(T x) { template std::enable_if_t>::value, Complex> local_asin(Complex x) { -#if defined(TEST_AGAINST_DEFAULT) return std::asin(x); -#else - // asin(x) - // = -i*ln(iz + sqrt(1 -z^2)) - // = -i*ln((ai - b) + sqrt(1 - (a + bi)*(a + bi))) - // = -i*ln((-b + ai) + sqrt(1 - (a**2 - b**2) - 2*abi)) - PreventFma noFma; - T a = x.real(); - T b = x.imag(); - T aa = a * a; - T bb = b * b; - T _ab = a * (-b); - T _2ab = noFma.add(_ab, _ab); - T aa_bb = static_cast(1) - noFma.sub(aa, bb); // 1 - (a*a-b*b) - Complex temp = Complex(-b, a) + local_sqrt(Complex(aa_bb, _2ab)); - auto ln = std::log(temp); - //-i*ln() => -i * ln => (ln.imag, -ln.real) - return Complex(ln.imag(), -ln.real()); -#endif } template @@ -1265,13 +1231,7 @@ std::enable_if_t::value, T> local_acos(T x) { template std::enable_if_t>::value, Complex> local_acos(Complex x) { -#if defined(TEST_AGAINST_DEFAULT) return std::acos(x); -#else - // pi/2 - asin(x) - auto half_pi = static_cast(M_PI) / static_cast(2); - return Complex(half_pi, 0) - local_asin(x); -#endif } template diff --git a/aten/src/ATen/test/vulkan_api_test.cpp b/aten/src/ATen/test/vulkan_api_test.cpp index 69894a9fde5d..03105eec90ea 100644 --- a/aten/src/ATen/test/vulkan_api_test.cpp 
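Editor's note: the Mm.cpp change in this hunk relaxes the addmm shape check so a lower-rank bias (as produced by nn.Linear, and exercised by the addmm_expand test below) is accepted and broadcast across the mat1 x mat2 result. A compact sketch of the relaxed compatibility test over raw size vectors, loosely mirroring the two TORCH_CHECK branches rather than reproducing the vTensor code:

#include <cstdint>
#include <iostream>
#include <vector>

// mat1 is (M, K), mat2 is (K, N). The bias `self` must either match (M, N)
// exactly, or be a 1-d size that lines up with one of the output dimensions.
bool addmm_shapes_ok(const std::vector<int64_t>& self,
                     const std::vector<int64_t>& mat1,
                     const std::vector<int64_t>& mat2) {
  if (mat1[1] != mat2[0]) {
    return false;  // inner dimensions must agree
  }
  if (self.size() >= 2) {
    return self[0] == mat1[0] && self[1] == mat2[1];
  }
  return !self.empty() && (self[0] == mat1[0] || self[0] == mat2[1]);
}

int main() {
  std::cout << addmm_shapes_ok({1000}, {1, 1280}, {1280, 1000}) << " "  // 1
            << addmm_shapes_ok({3, 7}, {3, 5}, {5, 7}) << " "           // 1
            << addmm_shapes_ok({2, 7}, {3, 5}, {5, 7}) << "\n";         // 0
}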
+++ b/aten/src/ATen/test/vulkan_api_test.cpp @@ -164,6 +164,33 @@ TEST(VulkanAPITest, addmm) { ASSERT_TRUE(check); } +TEST(VulkanAPITest, addmm_expand) { + if (!at::is_vulkan_available()) { + return; + } + + constexpr float alpha = 2.1f; + constexpr float beta = 103.24; + + const auto bias_cpu = at::rand({1000}, at::device(at::kCPU).dtype(at::kFloat)); + const auto m1_cpu = at::rand({1, 1280}, at::device(at::kCPU).dtype(at::kFloat)); + const auto m2_cpu = at::rand({1280, 1000}, at::device(at::kCPU).dtype(at::kFloat)); + const auto out_cpu = at::addmm(bias_cpu, m1_cpu, m2_cpu, beta, alpha); + + const auto bias_vulkan = bias_cpu.vulkan(); + const auto m1_vulkan = m1_cpu.vulkan(); + const auto m2_vulkan = m2_cpu.vulkan(); + const auto out_vulkan = at::addmm(bias_vulkan, m1_vulkan, m2_vulkan, beta, alpha); + + const auto check = almostEqual(out_cpu, out_vulkan.cpu()); + if (!check) { + std::cout << "Expected:\n" << out_cpu << std::endl; + std::cout << "Got:\n" << out_vulkan.cpu() << std::endl; + } + + ASSERT_TRUE(check); +} + TEST(VulkanAPITest, avg_pool2d) { if (!at::is_vulkan_available()) { return; @@ -634,6 +661,268 @@ TEST(VulkanAPITest, upsample_nearest2d) { ASSERT_TRUE(check); } +enum class OpType { + addmm, + conv2d, + hardtanh_, + mean, + }; + +class BaseOp { + public: + explicit BaseOp(const OpType type) : type_(type) {} + virtual ~BaseOp() = default; + + virtual at::Tensor run(at::Tensor&) const = 0; + virtual std::string toString() const = 0; + + private: + OpType type_; +}; + +class Addmm final : public BaseOp { + public: + Addmm( + const int64_t m1H, + const int64_t m1W, + const int64_t m2W, + const float beta, + const float alpha) + : BaseOp(OpType::addmm), + m2_(at::rand(c10::IntArrayRef({m1W, m2W}), at::device(at::kCPU).dtype(at::kFloat))), + v_m2(m2_.vulkan()), + b_(at::rand(c10::IntArrayRef({m1H, m2W}), at::device(at::kCPU).dtype(at::kFloat))), + v_b_(b_.vulkan()), + beta_(beta), + alpha_(alpha) { + } + + at::Tensor run(at::Tensor& t) const override { + if (t.is_vulkan()) { + return at::addmm(v_b_, t, v_m2, beta_, alpha_); + } + + return at::addmm(b_, t, m2_, beta_, alpha_); + } + + std::string toString() const override { + return "addmm"; + } + + private: + at::Tensor m2_; + at::Tensor v_m2; + at::Tensor b_; + at::Tensor v_b_; + float beta_; + float alpha_; +}; + +class Conv2d final : public BaseOp { + public: + Conv2d( + const c10::IntArrayRef wsizes, + const int64_t groups, + const int64_t stride, + const int64_t padding) + : BaseOp(OpType::conv2d), + groups_(groups), + stride_(stride), + padding_(padding), + w_(at::rand(wsizes, at::device(at::kCPU).dtype(at::kFloat))), + b_(at::zeros(wsizes[0], at::device(at::kCPU).dtype(at::kFloat))){ + } + + at::Tensor run(at::Tensor& t) const override { + return at::conv2d(t, w_, b_, {stride_}, {padding_}, {1}, groups_); + } + + std::string toString() const override { + return "conv2d"; + } + + private: + int64_t groups_; + int64_t stride_; + int64_t padding_; + at::Tensor w_; + at::Tensor b_; +}; + +class Hardtanh_ final : public BaseOp { + public: + Hardtanh_() : BaseOp(OpType::hardtanh_) {} + + at::Tensor run(at::Tensor& input) const override { + return at::hardtanh_(input, 0, 6); + } + + std::string toString() const override { + return "hardtanh_"; + } +}; + +class Mean final : public BaseOp { + public: + Mean() : BaseOp(OpType::mean) {} + + at::Tensor run(at::Tensor& input) const override { + return at::mean(input, {2, 3}, false); + } + + std::string toString() const override { + return "mean"; + } +}; + +class OpsList { + public: 
+ OpsList() {} + explicit OpsList(std::vector> ops) + : ops_(std::move(ops)) { + } + + auto run(const at::Tensor& input) { + at::Tensor output = input; + + for (const auto& op : ops_) { + output = op->run(output); + } + + return output; + } + + auto run(const at::Tensor& input, const at::Tensor& v_input) { + at::Tensor output = input; + at::Tensor v_output = v_input; + + for (const auto& op : ops_) { + output = op->run(output); + v_output = op->run(v_output); + } + + return std::make_pair(output, v_output); + } + + protected: + std::vector> ops_; +}; + +class MobileNetV2 final : public OpsList { + public: + MobileNetV2() { + ops_.emplace_back(new Conv2d({32, 3, 3, 3}, 1, 2, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({32, 1, 3, 3}, 32, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({16, 32, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({96, 16, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({96, 1, 3, 3}, 96, 2, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({24, 96, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({144, 24, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({144, 1, 3, 3}, 144, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({24, 144, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({144, 24, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({144, 1, 3, 3}, 144, 2, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({32, 144, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({192, 32, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({192, 1, 3, 3}, 192, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({32, 192, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({192, 32, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({192, 1, 3, 3}, 192, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({32, 192, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({192, 32, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({192, 1, 3, 3}, 192, 2, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({64, 192, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({64, 384, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({64, 384, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({64, 384, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({384, 64, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({384, 1, 3, 3}, 384, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({96, 384, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({576, 96, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + 
ops_.emplace_back(new Conv2d({576, 1, 3, 3}, 576, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({96, 576, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({576, 96, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({576, 1, 3, 3}, 576, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({96, 576, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({576, 96, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({576, 1, 3, 3}, 576, 2, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({160, 576, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({960, 160, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({960, 1, 3, 3}, 960, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({160, 960, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({960, 160, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({960, 1, 3, 3}, 960, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({160, 960, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({960, 160, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({960, 1, 3, 3}, 960, 1, 1)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Conv2d({320, 960, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Conv2d({1280, 320, 1, 1}, 1, 1, 0)); + ops_.emplace_back(new Hardtanh_()); + ops_.emplace_back(new Mean()); + ops_.emplace_back(new Addmm(1, 1280, 1000, 0, 1)); + } +}; + +TEST(VulkanAPITest, mobilenetv2) { + if (!at::is_vulkan_available()) { + return; + } + + MobileNetV2 mn2; + + const auto input = at::rand({1, 3, 224, 224}, at::device(at::kCPU).dtype(at::kFloat)); + const auto output = mn2.run(input, input.vulkan()); + + const auto check = almostEqual(output.first, output.second.cpu()); + if (!check) { + std::cout << "Expected:\n" << output.first << std::endl; + std::cout << "Got:\n" << output.second.cpu() << std::endl; + } + + ASSERT_TRUE(check); +} + } // namespace #endif /* USE_VULKAN_API */ diff --git a/aten/src/ATen/test/vulkan_test.cpp b/aten/src/ATen/test/vulkan_test.cpp index c8d1b72cc06b..d5483a7327b1 100644 --- a/aten/src/ATen/test/vulkan_test.cpp +++ b/aten/src/ATen/test/vulkan_test.cpp @@ -45,7 +45,12 @@ TEST(VulkanTest, upsampleNearest2D) { auto t_out = tv_out.to(at::TensorOptions{at::Device{at::kCPU}}.dtype(at::kFloat)); - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); + bool check = almostEqual(t_out_expected, t_out); + if (!check) { + std::cout << "expected:\n" << t_out_expected << std::endl; + std::cout << "got:\n" << t_out << std::endl; + } + ASSERT_TRUE(check); } TEST(VulkanTest, add) { @@ -208,7 +213,12 @@ TEST(VulkanTest, conv2dDWWeightsOnCPU) { auto tv_in = t_in.vulkan(); auto tv_out = at::conv2d(tv_in, t_w, t_b, stride, padding, dilation, groups); auto t_out = tv_out.cpu(); - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); + bool check = almostEqual(t_out_expected, t_out); + if (!check) { + std::cout << "expected:\n" << t_out_expected << std::endl; + std::cout << "got:\n" << t_out << std::endl; + } + ASSERT_TRUE(check); } TEST(VulkanTest, addmm) { @@ -227,7 +237,12 @@ TEST(VulkanTest, addmm) { auto tv_b = t_b.vulkan(); auto tv_out = at::addmm(tv_b, tv_m1, tv_m2, beta, alpha); auto t_out = tv_out.cpu(); - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); + bool check = almostEqual(t_out_expected, t_out); + if (!check) { + std::cout << 
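Editor's note: the Vulkan tests in this hunk and the next replace bare ASSERT_TRUE(almostEqual(...)) with a compute-the-check-first pattern that prints both tensors on mismatch, which makes failures debuggable. The same pattern as a standalone helper over float buffers (a sketch with hypothetical tolerances; the real tests compare at::Tensor values):

#include <cmath>
#include <cstddef>
#include <iostream>
#include <vector>

// Tolerance comparison that dumps expected and actual values on mismatch.
bool check_almost_equal(const std::vector<float>& expected,
                        const std::vector<float>& got,
                        float atol = 1e-5f, float rtol = 1e-4f) {
  bool ok = expected.size() == got.size();
  for (std::size_t i = 0; ok && i < expected.size(); ++i) {
    ok = std::fabs(expected[i] - got[i]) <= atol + rtol * std::fabs(expected[i]);
  }
  if (!ok) {
    std::cout << "expected:";
    for (float v : expected) std::cout << ' ' << v;
    std::cout << "\ngot:";
    for (float v : got) std::cout << ' ' << v;
    std::cout << '\n';
  }
  return ok;
}

int main() {
  std::cout << check_almost_equal({1.f, 2.f}, {1.f, 2.00001f}) << "\n";  // 1
}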
"expected:\n" << t_out_expected << std::endl; + std::cout << "got:\n" << t_out << std::endl; + } + ASSERT_TRUE(check); } TEST(VulkanTest, mm) { @@ -242,7 +257,12 @@ TEST(VulkanTest, mm) { auto tv_m2 = t_m2.vulkan(); auto tv_out = tv_m1.mm(tv_m2); auto t_out = tv_out.cpu(); - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); + bool check = almostEqual(t_out_expected, t_out); + if (!check) { + std::cout << "expected:\n" << t_out_expected << std::endl; + std::cout << "got:\n" << t_out << std::endl; + } + ASSERT_TRUE(check); } TEST(VulkanTest, clamp) { @@ -301,7 +321,12 @@ TEST(VulkanTest, mean) { auto tv_in = t_in.vulkan(); auto tv_out = at::mean(tv_in, {2, 3}, false); auto t_out = tv_out.cpu(); - ASSERT_TRUE(almostEqual(t_out, t_out_expected)); + bool check = almostEqual(t_out_expected, t_out); + if (!check) { + std::cout << "expected:\n" << t_out_expected << std::endl; + std::cout << "got:\n" << t_out << std::endl; + } + ASSERT_TRUE(check); } enum class OpType { conv2d, hardtanh_, mean, addmm }; @@ -874,7 +899,7 @@ TEST(VulkanTest, cat) { ASSERT_TRUE(check); } -TEST(VulkanTest, max_pool2d) { +TEST(VulkanTest, DISABLED_max_pool2d) { if (!at::is_vulkan_available()) return; diff --git a/benchmarks/operator_benchmark/benchmark_caffe2.py b/benchmarks/operator_benchmark/benchmark_caffe2.py index 495853e21c34..de56e00fa225 100644 --- a/benchmarks/operator_benchmark/benchmark_caffe2.py +++ b/benchmarks/operator_benchmark/benchmark_caffe2.py @@ -107,7 +107,7 @@ def __init__(self, op_bench, test_config): self.test_config = test_config self.framework = "Caffe2" - def run_forward(self, num_runs, print_per_iter=False, cubda_sync=False): + def run_forward(self, num_runs, print_per_iter=False, cuda_sync=False): """ Run the forward path of an operator in a loop """ with core.DeviceScope(self.op_bench.dev): @@ -115,7 +115,7 @@ def run_forward(self, num_runs, print_per_iter=False, cubda_sync=False): if not workspace.RunOperatorMultiple(op, num_runs): raise ValueError("Unable to run operator test case: {}".format(self.test_name)) - def run_backward(self, num_runs): + def run_backward(self, num_runs, print_per_iter=False): """ Run the backward path of an operator in a loop """ with core.DeviceScope(self.op_bench.dev): diff --git a/benchmarks/operator_benchmark/pt/tensor_to_test.py b/benchmarks/operator_benchmark/pt/tensor_to_test.py new file mode 100644 index 000000000000..7f4c440c2c39 --- /dev/null +++ b/benchmarks/operator_benchmark/pt/tensor_to_test.py @@ -0,0 +1,39 @@ +import operator_benchmark as op_bench +import torch + +tensor_conversion_short_configs = op_bench.cross_product_configs( + M=(8, 16, 32,), + N=(16, 64, 128,), + device=['cpu', 'cuda'], + tags=['short'], +) + +tensor_conversion_long_configs = op_bench.cross_product_configs( + M=(64, 128, 256, 512,), + N=(256, 512, 1024, 2048,), + device=['cpu', 'cuda'], + tags=['long'], +) + +class FloatToHalfTensorConversionBenchmark(op_bench.TorchBenchmarkBase): + def init(self, M, N, device): + self.input = torch.rand(M, N, device=device, requires_grad=False, dtype=torch.float) + + def forward(self): + return self.input.to(torch.half) + +class HalfToFloatTensorConversionBenchmark(op_bench.TorchBenchmarkBase): + def init(self, M, N, device): + self.input = torch.rand(M, N, device=device, requires_grad=False, dtype=torch.half) + + def forward(self): + return self.input.to(torch.float) + + +op_bench.generate_pt_test(tensor_conversion_short_configs, FloatToHalfTensorConversionBenchmark) +op_bench.generate_pt_test(tensor_conversion_long_configs, 
FloatToHalfTensorConversionBenchmark) +op_bench.generate_pt_test(tensor_conversion_short_configs, HalfToFloatTensorConversionBenchmark) +op_bench.generate_pt_test(tensor_conversion_long_configs, HalfToFloatTensorConversionBenchmark) + +if __name__ == "__main__": + op_bench.benchmark_runner.main() diff --git a/c10/test/util/Metaprogramming_test.cpp b/c10/test/util/Metaprogramming_test.cpp index 0f55814bf6f5..88c8e0facad1 100644 --- a/c10/test/util/Metaprogramming_test.cpp +++ b/c10/test/util/Metaprogramming_test.cpp @@ -243,14 +243,36 @@ namespace test_tuple_take { TEST(MetaprogrammingTest, TupleTake_nonemptyPrefix) { auto x = std::make_tuple(0, "HEY", 2.0); - auto y = tuple_take, 2>(x); + auto y = tuple_take(x); auto z = std::make_tuple(0, "HEY"); EXPECT_EQ(y, z); } TEST(MetaprogrammingTest, TupleTake_fullPrefix) { auto x = std::make_tuple(0, "HEY", 2.0); - auto y = tuple_take, 3>(x); + auto y = tuple_take(x); + EXPECT_EQ(x, y); + } + + TEST(MetaprogrammingTest, TupleTake_negative) { + auto x = std::make_tuple(0, "HEY", 2.0); + auto y = tuple_take(x); + auto z = std::make_tuple("HEY", 2.0); + EXPECT_EQ(y, z); + } +} + +namespace test_tuple_slice { + TEST(MetaprogrammingTest, TupleSlice_middle) { + auto x = std::make_tuple(0, "HEY", 2.0, false); + auto y = tuple_slice(x); + auto z = std::make_tuple("HEY", 2.0); + EXPECT_EQ(y, z); + } + + TEST(MetaprogrammingTest, TupleSlice_full) { + auto x = std::make_tuple(0, "HEY", 2.0); + auto y = tuple_slice(x); EXPECT_EQ(x, y); } } diff --git a/c10/util/C++17.h b/c10/util/C++17.h index 9329ab3b854c..cc11122e8af7 100644 --- a/c10/util/C++17.h +++ b/c10/util/C++17.h @@ -258,6 +258,11 @@ struct _if_constexpr final { * Note: In Example 3, both branches return int, so func() returns int. This is not necessary. * If func() had a return type of "auto", then both branches could return different * types, say func() could return int and func() could return string. + * + * Note: if_constexpr is *eager* w.r.t. template expansion - meaning this + * polyfill does not behave like a true "if statement at compilation time". + * The `_` trick above only defers typechecking, which happens after templates + * have been expanded. (Of course this is all that's necessary for many use cases). */ template decltype(auto) if_constexpr(ThenCallback&& thenCallback, ElseCallback&& elseCallback) { diff --git a/c10/util/Metaprogramming.h b/c10/util/Metaprogramming.h index ee5252097377..ae929a93ca09 100644 --- a/c10/util/Metaprogramming.h +++ b/c10/util/Metaprogramming.h @@ -130,6 +130,30 @@ decltype(auto) filter_map(const Mapper& mapper, Args&&... args) { } +/** + * make_offset_index_sequence + * Like make_index_sequence, but starting from Start instead of 0. + * + * Example: + * make_offset_index_sequence<10, 3> == std::index_sequence<10, 11, 12> + */ +template +struct make_offset_index_sequence_impl + : make_offset_index_sequence_impl +{ + static_assert(static_cast(Start) >= 0, "make_offset_index_sequence: Start < 0"); + static_assert(static_cast(N) >= 0, "make_offset_index_sequence: N < 0"); +}; + +template +struct make_offset_index_sequence_impl { + typedef std::index_sequence type; +}; + +template +using make_offset_index_sequence = typename make_offset_index_sequence_impl::type; + + /** * Use tuple_elements to extract a position-indexed subset of elements * from the argument tuple into a result tuple. @@ -138,22 +162,58 @@ decltype(auto) filter_map(const Mapper& mapper, Args&&... 
args) { * std::tuple t = std::make_tuple(0, "HEY", 2.0); * std::tuple result = tuple_elements(t, std::index_sequence<0, 2>()); */ -template -constexpr auto tuple_elements(Tuple t, std::index_sequence) { - return std::tuple...>(std::get(t)...); +template +constexpr auto tuple_elements(Tuple t, std::index_sequence) { + return std::tuple...>(std::get(t)...); } /** - * Use tuple_take to extract the first n elements from the argument tuple - * into a result tuple. + * Use tuple_take to extract the first or last n elements from the argument + * tuple into a result tuple. * * Example: * std::tuple t = std::make_tuple(0, "HEY", 2.0); - * std::tuple result = tuple_take(t); + * std::tuple first_two = tuple_take(t); + * std::tuple last_two = tuple_take(t); + */ +template +struct TupleTake {}; + +template +struct TupleTake= 0, void>> { + static auto call(Tuple t) { + constexpr size_t size = std::tuple_size(); + static_assert(N <= size, "tuple_take: N > size"); + return tuple_elements(t, std::make_index_sequence{}); + } +}; + +template +struct TupleTake> { + static auto call(Tuple t) { + constexpr size_t size = std::tuple_size(); + static_assert(-N <= size, "tuple_take: -N > size"); + return tuple_elements(t, make_offset_index_sequence{}); + } +}; + +template +auto tuple_take(Tuple t) { + return TupleTake::call(t); +} + +/** + * Use tuple_slice to extract a contiguous subtuple from the argument. + * + * Example: + * std::tuple t = std::make_tuple(0, "HEY", 2.0, false); + * std::tuple middle_two = tuple_slice(t); */ -template -constexpr auto tuple_take(Tuple t) { - return tuple_elements(t, std::make_index_sequence{}); +template +constexpr auto tuple_slice(Tuple t) { + constexpr size_t size = std::tuple_size(); + static_assert(Start + N <= size, "tuple_slice: Start + N > size"); + return tuple_elements(t, make_offset_index_sequence{}); } diff --git a/c10/util/intrusive_ptr.h b/c10/util/intrusive_ptr.h index 453196510aa8..825b934852d4 100644 --- a/c10/util/intrusive_ptr.h +++ b/c10/util/intrusive_ptr.h @@ -5,6 +5,11 @@ #include #include +namespace pybind11 { +template +class class_; +} + namespace c10 { class intrusive_ptr_target; namespace raw { @@ -14,6 +19,9 @@ namespace raw { namespace intrusive_ptr { inline void incref(intrusive_ptr_target * self); } + + // constructor tag used by intrusive_ptr constructors + struct DontIncreaseRefcount {}; } /** * intrusive_ptr is an alternative to shared_ptr that has better @@ -182,6 +190,16 @@ class intrusive_ptr final { friend class intrusive_ptr; friend class weak_intrusive_ptr; + // Make pybind11::class_ be a friend class of intrusive_ptr, so that custom + // smart holder in pybind11 could access the private constructor of + // intrusive_ptr(T*) which took the ownership of the object. This is required + // by customer holder macro PYBIND11_DECLARE_HOLDER_TYPE, where it uses + // intrusive_ptr(TTarget*) to initialize and take ownership of the object. For + // details, see + // https://pybind11.readthedocs.io/en/stable/advanced/smart_ptrs.html#custom-smart-pointers + template + friend class pybind11::class_; + void retain_() { if (target_ != NullType::singleton()) { size_t new_refcount = ++target_->refcount_; @@ -207,16 +225,37 @@ class intrusive_ptr final { target_ = NullType::singleton(); } + // raw pointer constructors are not public because we shouldn't make + // intrusive_ptr out of raw pointers except from inside the make_intrusive(), + // reclaim() and weak_intrusive_ptr::lock() implementations. 
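Editor's note: the Metaprogramming.h additions above build tuple_take with negative counts and tuple_slice on top of an offset index sequence. A self-contained C++14 sketch of the core trick, shifting std::make_index_sequence by a start offset and using it to pick tuple elements (simplified; the real header also handles negative N and element const/ref qualifiers):

#include <cstddef>
#include <iostream>
#include <string>
#include <tuple>
#include <utility>

// Copy the elements of `t` selected by the index pack into a new tuple.
template <class Tuple, std::size_t... Is>
auto tuple_elements(const Tuple& t, std::index_sequence<Is...>) {
  return std::make_tuple(std::get<Is>(t)...);
}

// Shift index_sequence<0, 1, ..., N-1> by Start: <Start, Start+1, ..., Start+N-1>.
template <std::size_t Start, std::size_t... Is>
constexpr auto offset_by(std::index_sequence<Is...>) {
  return std::index_sequence<(Start + Is)...>{};
}

// Contiguous subtuple covering [Start, Start + N).
template <std::size_t Start, std::size_t N, class Tuple>
auto tuple_slice(const Tuple& t) {
  static_assert(Start + N <= std::tuple_size<Tuple>::value, "slice out of range");
  return tuple_elements(t, offset_by<Start>(std::make_index_sequence<N>{}));
}

int main() {
  auto t = std::make_tuple(0, std::string("HEY"), 2.0, false);
  auto mid = tuple_slice<1, 2>(t);  // ("HEY", 2.0)
  std::cout << std::get<0>(mid) << " " << std::get<1>(mid) << "\n";
}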
+ // This constructor will not increase the ref counter for you. - // This is not public because we shouldn't make intrusive_ptr out of raw - // pointers except from inside the make_intrusive() and - // weak_intrusive_ptr::lock() implementations - explicit intrusive_ptr(TTarget* target) noexcept : target_(target) {} + // We use the tagged dispatch mechanism to explicitly mark this constructor + // to not increase the refcount + explicit intrusive_ptr(TTarget* target, raw::DontIncreaseRefcount) noexcept + : target_(target) {} + + // This constructor will increase the ref counter for you. + // This constructor will be used by the make_intrusive(), and also pybind11, + // which wrap the intrusive_ptr holder around the raw pointer and incref + // correspondingly (pybind11 requires raw pointer constructor to incref by + // default). + explicit intrusive_ptr(TTarget* target) + : intrusive_ptr(target, raw::DontIncreaseRefcount{}) { + if (target_ != NullType::singleton()) { + // We can't use retain_(), because we also have to increase weakcount + // and because we allow raising these values from 0, which retain_() + // has an assertion against. + ++target_->refcount_; + ++target_->weakcount_; + } + } public: using element_type = TTarget; - intrusive_ptr() noexcept : intrusive_ptr(NullType::singleton()) {} + intrusive_ptr() noexcept + : intrusive_ptr(NullType::singleton(), raw::DontIncreaseRefcount{}) {} intrusive_ptr(intrusive_ptr&& rhs) noexcept : target_(rhs.target_) { rhs.target_ = NullType::singleton(); @@ -347,19 +386,17 @@ class intrusive_ptr final { * passed in *must* have been created using intrusive_ptr::release(). */ static intrusive_ptr reclaim(TTarget* owning_ptr) { - return intrusive_ptr(owning_ptr); + return intrusive_ptr(owning_ptr, raw::DontIncreaseRefcount{}); } + /** + * Allocate a heap object with args and wrap it inside a intrusive_ptr and + * incref. This is a helper function to let make_intrusive() access private + * intrusive_ptr constructors. + */ template static intrusive_ptr make(Args&&... args) { - auto result = intrusive_ptr(new TTarget(std::forward(args)...)); - // We can't use retain_(), because we also have to increase weakcount - // and because we allow raising these values from 0, which retain_() - // has an assertion against. - ++result.target_->refcount_; - ++result.target_->weakcount_; - - return result; + return intrusive_ptr(new TTarget(std::forward(args)...)); } /** @@ -590,17 +627,18 @@ class weak_intrusive_ptr final { intrusive_ptr lock() const noexcept { if (expired()) { - return intrusive_ptr(NullType::singleton()); + return intrusive_ptr(); } else { auto refcount = target_->refcount_.load(); do { if (refcount == 0) { // Object already destructed, no strong references left anymore. // Return nullptr. 
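Editor's note: the intrusive_ptr rework in this hunk separates "adopt a pointer whose count is already correct" from "take ownership and bump the count" by tag dispatch rather than by two identically-typed constructors. A minimal standalone sketch of that tagged-constructor pattern (a toy refcounted handle, not c10's implementation):

#include <iostream>

struct DontIncreaseRefcount {};  // constructor tag: adopt without touching the count

struct Counted {
  int refcount = 0;
};

class Handle {
 public:
  // Adopt a pointer whose count already accounts for this handle (reclaim-style).
  Handle(Counted* p, DontIncreaseRefcount) : p_(p) {}

  // Take ownership of a raw pointer and record the new reference.
  explicit Handle(Counted* p) : p_(p) {
    if (p_) ++p_->refcount;
  }

  ~Handle() {
    if (p_ && --p_->refcount == 0) delete p_;
  }

  Handle(const Handle&) = delete;  // keep the sketch minimal
  Handle& operator=(const Handle&) = delete;

  int count() const { return p_ ? p_->refcount : 0; }

 private:
  Counted* p_;
};

int main() {
  auto* raw = new Counted{1};                   // one reference already exists
  Handle adopted(raw, DontIncreaseRefcount{});  // adopt: count stays at 1
  std::cout << adopted.count() << "\n";         // 1; destruction drops it to 0 and frees
}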
- return intrusive_ptr(NullType::singleton()); + return intrusive_ptr(); } } while (!target_->refcount_.compare_exchange_weak(refcount, refcount + 1)); - return intrusive_ptr(target_); + return intrusive_ptr( + target_, raw::DontIncreaseRefcount{}); } } diff --git a/test/backward_compatibility/check_backward_compatibility.py b/test/backward_compatibility/check_backward_compatibility.py index 4d22c27e3c7f..06ef9001c40a 100644 --- a/test/backward_compatibility/check_backward_compatibility.py +++ b/test/backward_compatibility/check_backward_compatibility.py @@ -133,6 +133,50 @@ ("aten::add_relu", datetime.date(2020, 10, 28)), ("aten::add_relu_", datetime.date(2020, 10, 28)), ("aten::hash", datetime.date(2020, 11, 15)), + ("aten::_foreach_log", datetime.date(2020, 11, 15)), + ("aten::_foreach_round", datetime.date(2020, 11, 15)), + ("aten::_foreach_sinh", datetime.date(2020, 11, 15)), + ("aten::_foreach_lgamma_", datetime.date(2020, 11, 15)), + ("aten::_foreach_lgamma", datetime.date(2020, 11, 15)), + ("aten::_foreach_log10", datetime.date(2020, 11, 15)), + ("aten::_foreach_round", datetime.date(2020, 11, 15)), + ("aten::_foreach_sin", datetime.date(2020, 11, 15)), + ("aten::_foreach_sinh_", datetime.date(2020, 11, 15)), + ("aten::_foreach_tanh_", datetime.date(2020, 11, 15)), + ("aten::_foreach_abs_", datetime.date(2020, 11, 15)), + ("aten::_foreach_sin_", datetime.date(2020, 11, 15)), + ("aten::_foreach_tan", datetime.date(2020, 11, 15)), + ("aten::_foreach_tan_", datetime.date(2020, 11, 15)), + ("aten::_foreach_log2_", datetime.date(2020, 11, 15)), + ("aten::_foreach_tanh", datetime.date(2020, 11, 15)), + ("aten::_foreach_log_", datetime.date(2020, 11, 15)), + ("aten::_foreach_log10_", datetime.date(2020, 11, 15)), + ("aten::_foreach_neg_", datetime.date(2020, 11, 15)), + ("aten::_foreach_log2", datetime.date(2020, 11, 15)), + ("aten::_foreach_log1p_", datetime.date(2020, 11, 15)), + ("aten::_foreach_abs", datetime.date(2020, 11, 15)), + ("aten::_foreach_acos", datetime.date(2020, 11, 15)), + ("aten::_foreach_acos_", datetime.date(2020, 11, 15)), + ("aten::_foreach_asin", datetime.date(2020, 11, 15)), + ("aten::_foreach_asin_", datetime.date(2020, 11, 15)), + ("aten::_foreach_atan", datetime.date(2020, 11, 15)), + ("aten::_foreach_atan_", datetime.date(2020, 11, 15)), + ("aten::_foreach_ceil", datetime.date(2020, 11, 15)), + ("aten::_foreach_ceil_", datetime.date(2020, 11, 15)), + ("aten::_foreach_cos", datetime.date(2020, 11, 15)), + ("aten::_foreach_cos_", datetime.date(2020, 11, 15)), + ("aten::_foreach_cosh", datetime.date(2020, 11, 15)), + ("aten::_foreach_cosh_", datetime.date(2020, 11, 15)), + ("aten::_foreach_erf", datetime.date(2020, 11, 15)), + ("aten::_foreach_erf_", datetime.date(2020, 11, 15)), + ("aten::_foreach_erfc", datetime.date(2020, 11, 15)), + ("aten::_foreach_erfc_", datetime.date(2020, 11, 15)), + ("aten::_foreach_expm1", datetime.date(2020, 11, 15)), + ("aten::_foreach_expm1_", datetime.date(2020, 11, 15)), + ("aten::_foreach_floor", datetime.date(2020, 11, 15)), + ("aten::_foreach_floor_", datetime.date(2020, 11, 15)), + ("aten::_foreach_log1p", datetime.date(2020, 11, 15)), + ("aten::_foreach_neg", datetime.date(2020, 11, 15)), ] def allow_listed(schema, allow_list): diff --git a/test/cpp/rpc/e2e_test_base.h b/test/cpp/rpc/e2e_test_base.h index 9d3ab71c0cfc..114284839858 100644 --- a/test/cpp/rpc/e2e_test_base.h +++ b/test/cpp/rpc/e2e_test_base.h @@ -28,7 +28,7 @@ class TestE2EBase : public ::testing::Test { autogradContainer = getDistAutogradContainer(); // Setup 
server store. - store = std::make_shared( + store = c10::make_intrusive( serverAddress, 0, numWorkers, true, std::chrono::seconds(10)); buildRpcAgent(); @@ -147,7 +147,7 @@ class TestE2EBase : public ::testing::Test { std::shared_ptr rpcAgent; static const size_t numIters; static const size_t numWorkers; - std::shared_ptr store; + c10::intrusive_ptr store; static const char* serverAddress; }; diff --git a/test/cpp_extensions/cpp_c10d_extension.cpp b/test/cpp_extensions/cpp_c10d_extension.cpp index b4901cdbcf4d..d5ba55a6379c 100644 --- a/test/cpp_extensions/cpp_c10d_extension.cpp +++ b/test/cpp_extensions/cpp_c10d_extension.cpp @@ -23,92 +23,92 @@ ProcessGroupTest::ProcessGroupTest(int rank, int size) ProcessGroupTest::~ProcessGroupTest() {} -std::shared_ptr ProcessGroupTest::broadcast( +c10::intrusive_ptr ProcessGroupTest::broadcast( std::vector& tensors, const BroadcastOptions& opts) { - return std::make_shared(); + return c10::make_intrusive(); } -std::shared_ptr ProcessGroupTest::allreduce( +c10::intrusive_ptr ProcessGroupTest::allreduce( std::vector& tensors, const AllreduceOptions& opts) { - return std::make_shared(); + return c10::make_intrusive(); } -std::shared_ptr ProcessGroupTest::allreduce_coalesced( +c10::intrusive_ptr ProcessGroupTest::allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts) { throw std::runtime_error("ProcessGroupTest does not support allreduce_coalesced"); } -std::shared_ptr ProcessGroupTest::reduce( +c10::intrusive_ptr ProcessGroupTest::reduce( std::vector& tensors, const ReduceOptions& opts) { throw std::runtime_error("ProcessGroupTest does not support reduce"); } -std::shared_ptr ProcessGroupTest::allgather( +c10::intrusive_ptr ProcessGroupTest::allgather( std::vector>& outputTensors, std::vector& inputTensors, const AllgatherOptions& opts) { throw std::runtime_error("ProcessGroupTest does not support allgather"); } -std::shared_ptr ProcessGroupTest::allgather_base( +c10::intrusive_ptr ProcessGroupTest::allgather_base( at::Tensor& outputBuffer, at::Tensor& inputBuffer, const AllgatherOptions& opts) { throw std::runtime_error("ProcessGroupTest does not support allgather_base"); } -std::shared_ptr ProcessGroupTest::barrier( +c10::intrusive_ptr ProcessGroupTest::barrier( const BarrierOptions& opts) { - return std::make_shared(); + return c10::make_intrusive(); } -std::shared_ptr ProcessGroupTest::gather( +c10::intrusive_ptr ProcessGroupTest::gather( std::vector>& outputTensors, std::vector& inputTensors, const GatherOptions& opts) { throw std::runtime_error("ProcessGroupTest does not support gather"); } -std::shared_ptr ProcessGroupTest::scatter( +c10::intrusive_ptr ProcessGroupTest::scatter( std::vector& outputTensors, std::vector>& inputTensors, const ScatterOptions& opts) { throw std::runtime_error("ProcessGroupTest does not support scatter"); } -std::shared_ptr ProcessGroupTest::reduce_scatter( +c10::intrusive_ptr ProcessGroupTest::reduce_scatter( std::vector& outputTensors, std::vector>& inputTensors, const ReduceScatterOptions& opts) { throw std::runtime_error("ProcessGroupTest does not support reduce_scatter"); } -std::shared_ptr ProcessGroupTest::send( +c10::intrusive_ptr ProcessGroupTest::send( std::vector& tensors, int dstRank, int tag) { throw std::runtime_error("ProcessGroupTest does not support send"); } -std::shared_ptr ProcessGroupTest::recv( +c10::intrusive_ptr ProcessGroupTest::recv( std::vector& tensors, int srcRank, int tag) { throw std::runtime_error("ProcessGroupTest does not support recv"); } -std::shared_ptr 
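Editor's note: the test and extension changes in this hunk swap std::shared_ptr for c10::intrusive_ptr on Store and Work objects; a type managed this way derives from c10::intrusive_ptr_target and is constructed via c10::make_intrusive. A hedged usage sketch, assuming a translation unit built against the c10 headers (MyWork is a made-up type, not the real c10d work class):

#include <c10/util/intrusive_ptr.h>
#include <iostream>

// Intrusively refcounted types inherit from c10::intrusive_ptr_target.
struct MyWork : c10::intrusive_ptr_target {
  explicit MyWork(int tag) : tag(tag) {}
  int tag;
};

c10::intrusive_ptr<MyWork> make_work(int tag) {
  // make_intrusive allocates the object and returns an owning intrusive_ptr,
  // analogous to std::make_shared for shared_ptr.
  return c10::make_intrusive<MyWork>(tag);
}

int main() {
  auto w = make_work(42);
  std::cout << w->tag << "\n";  // 42
}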
ProcessGroupTest::recvAnysource( +c10::intrusive_ptr ProcessGroupTest::recvAnysource( std::vector& tensor, int tag) { throw std::runtime_error("ProcessGroupTest does not support recvAnysource"); } std::shared_ptr ProcessGroupTest::createProcessGroupTest( - const std::shared_ptr<::c10d::Store>& store, + const c10::intrusive_ptr<::c10d::Store>& store, int rank, int size, const std::chrono::duration& timeout) { diff --git a/test/cpp_extensions/cpp_c10d_extension.hpp b/test/cpp_extensions/cpp_c10d_extension.hpp index d8dffcd20327..1773953629d5 100644 --- a/test/cpp_extensions/cpp_c10d_extension.hpp +++ b/test/cpp_extensions/cpp_c10d_extension.hpp @@ -41,67 +41,67 @@ class ProcessGroupTest : public ProcessGroup { explicit ProcessGroupTest(int rank = -1, int size = -1); virtual ~ProcessGroupTest(); - std::shared_ptr broadcast( + c10::intrusive_ptr broadcast( std::vector& data, const BroadcastOptions& opts = BroadcastOptions()) override; - std::shared_ptr allreduce( + c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; - std::shared_ptr allreduce_coalesced( + c10::intrusive_ptr allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts = AllreduceCoalescedOptions()) override; - std::shared_ptr reduce( + c10::intrusive_ptr reduce( std::vector& tensors, const ReduceOptions& opts = ReduceOptions()) override; - std::shared_ptr allgather( + c10::intrusive_ptr allgather( std::vector>& outputTensors, std::vector& inputTensors, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr allgather_base( + c10::intrusive_ptr allgather_base( at::Tensor& outputBuffer, at::Tensor& inputBuffer, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr barrier( + c10::intrusive_ptr barrier( const BarrierOptions& opts = BarrierOptions()) override; - std::shared_ptr gather( + c10::intrusive_ptr gather( std::vector>& outputTensors, std::vector& inputTensors, const GatherOptions& opts = GatherOptions()) override; - std::shared_ptr scatter( + c10::intrusive_ptr scatter( std::vector& outputTensors, std::vector>& inputTensors, const ScatterOptions& opts = ScatterOptions()) override; - std::shared_ptr reduce_scatter( + c10::intrusive_ptr reduce_scatter( std::vector& outputTensors, std::vector>& inputTensors, const ReduceScatterOptions& opts = ReduceScatterOptions()) override; - std::shared_ptr send( + c10::intrusive_ptr send( std::vector& tensors, int dstRank, int tag); - std::shared_ptr recv( + c10::intrusive_ptr recv( std::vector& tensors, int srcRank, int tag); - std::shared_ptr recvAnysource( + c10::intrusive_ptr recvAnysource( std::vector& tensor, int tag); // Create a new ProcessGroupTest instance static std::shared_ptr createProcessGroupTest( - const std::shared_ptr<::c10d::Store>& store, + const c10::intrusive_ptr<::c10d::Store>& store, int rank, int size, const std::chrono::duration& timeout); diff --git a/test/jit/test_recursive_script.py b/test/jit/test_recursive_script.py index bd9a2bb32b89..48e874990a73 100644 --- a/test/jit/test_recursive_script.py +++ b/test/jit/test_recursive_script.py @@ -495,6 +495,50 @@ def forward(self, x): self.checkModule(M(), (torch.randn(5, 5),)) + def test_prepare_scriptable_basic(self): + class SeluButReluWhenScripted(torch.nn.SELU): + def __prepare_scriptable__(self): + return nn.ReLU() + + t = torch.randn(5, 5) + m = SeluButReluWhenScripted() + sm = torch.jit.script(m) + eager_out = m(t) + script_out = sm(t) + self.assertNotEqual(eager_out, script_out) + 
+ def test_prepare_scriptable_iterable_modules(self): + class SeluButReluWhenScripted(torch.nn.SELU): + def __prepare_scriptable__(self): + return nn.ReLU() + + class M(torch.nn.Module): + def __init__(self): + super(M, self).__init__() + shared = SeluButReluWhenScripted() + self.sequential = nn.Sequential( + SeluButReluWhenScripted(), + SeluButReluWhenScripted(), + nn.Sequential(SeluButReluWhenScripted(), shared, SeluButReluWhenScripted()), + shared, + ) + self.module_list = nn.ModuleList([SeluButReluWhenScripted(), + shared, + SeluButReluWhenScripted()]) + + def forward(self, x): + for mod in self.module_list: + x += mod(x) + x += self.sequential(x) + return x + + t = torch.randn(5, 5) + m = M() + eager_out = m(t.clone()) + sm = torch.jit.script(m) + script_out = sm(t.clone()) + self.assertNotEqual(eager_out, script_out) + def test_attributes(self): @torch.jit.script class Inner2(object): diff --git a/test/jit/test_torchbind.py b/test/jit/test_torchbind.py index df482403f6c7..c1ca50270197 100644 --- a/test/jit/test_torchbind.py +++ b/test/jit/test_torchbind.py @@ -62,6 +62,32 @@ def f(): return ss1.pop() + ss2.pop() test_equality(f, lambda x: x) + # test nn module with prepare_scriptable function + class NonJitableClass(object): + def __init__(self, int1, int2): + self.int1 = int1 + self.int2 = int2 + + def return_vals(self): + return self.int1, self.int2 + + class CustomWrapper(torch.nn.Module): + def __init__(self, foo): + super(CustomWrapper, self).__init__() + self.foo = foo + + def forward(self) -> None: + self.foo.increment(1) + return + + def __prepare_scriptable__(self): + int1, int2 = self.foo.return_vals() + foo = torch.classes._TorchScriptTesting._Foo(int1, int2) + return CustomWrapper(foo) + + foo = CustomWrapper(NonJitableClass(1, 2)) + jit_foo = torch.jit.script(foo) + def test_torchbind_take_as_arg(self): global StackString # see [local resolution in python] StackString = torch.classes._TorchScriptTesting._StackString @@ -143,6 +169,23 @@ def foo(): scripted = torch.jit.script(foo) self.assertEqual(scripted(), "mom") + def test_torchbind_class_attr_recursive(self): + class FooBar(torch.nn.Module): + def __init__(self, foo_model): + super(FooBar, self).__init__() + self.foo_mod = foo_model + + def forward(self) -> int: + return self.foo_mod.info() + + def to_ivalue(self): + torchbind_model = torch.classes._TorchScriptTesting._Foo(self.foo_mod.info(), 1) + return FooBar(torchbind_model) + + inst = FooBar(torch.classes._TorchScriptTesting._Foo(2, 3)) + scripted = torch.jit.script(inst.to_ivalue()) + self.assertEqual(scripted(), 6) + def test_torchbind_class_attribute(self): class FooBar1234(torch.nn.Module): def __init__(self): diff --git a/test/onnx/test_pytorch_onnx_onnxruntime.py b/test/onnx/test_pytorch_onnx_onnxruntime.py index 34ebc70218b5..64bf20742d15 100644 --- a/test/onnx/test_pytorch_onnx_onnxruntime.py +++ b/test/onnx/test_pytorch_onnx_onnxruntime.py @@ -2498,6 +2498,14 @@ def test_logsoftmax_dim(self): input = torch.randn(3, 4, 5, 6) self.run_test(model, input) + def test_logsoftmax_dtype(self): + class Model(torch.nn.Module): + def forward(self, x): + return torch.nn.functional.log_softmax(x, dim=1, dtype=torch.float64) + + x = torch.randn(3, 4, 5, requires_grad=True) + self.run_test(Model(), x) + @skipIfUnsupportedMinOpsetVersion(9) @disableScriptTest() # scripting prim_dtype def test_lstm_no_hidden(self): @@ -3003,12 +3011,17 @@ def forward(self, input, other): y = torch.randn(6, 4) self.run_test(ViewModel(), (x, y)) - @disableScriptTest() # ONNX Shape 
inference failure in if/else block for Gemm def test_weight_norm(self): + # addmm for 3-d inputs converts to onnx::MatMul model = torch.nn.utils.weight_norm(torch.nn.Linear(5, 10), dim=1) x = torch.randn(3, 4, 5, requires_grad=True) self.run_test(model, x) + # addmm for 2-d inputs converts to onnx::Gemm + model = torch.nn.utils.weight_norm(torch.nn.Linear(5, 10), dim=1) + x = torch.randn(4, 5, requires_grad=True) + self.run_test(model, x) + model = torch.nn.utils.weight_norm(torch.nn.Conv1d(1, 1, 3)) x = torch.randn(1, 1, 5, requires_grad=True) self.run_test(model, x) @@ -3021,12 +3034,17 @@ def test_weight_norm(self): x = torch.randn(3, 3, 5, requires_grad=True) self.run_test(model, x) - @disableScriptTest() # ONNX Shape inference failure in if/else block for Gemm def test_weight_norm_nodim(self): + # addmm for 3-d inputs converts to onnx::MatMul model = torch.nn.utils.weight_norm(torch.nn.Linear(5, 10), dim=None) x = torch.randn(3, 4, 5, requires_grad=True) self.run_test(model, x) + # addmm for 2-d inputs converts to onnx::Gemm + model = torch.nn.utils.weight_norm(torch.nn.Linear(5, 10), dim=None) + x = torch.randn(4, 5, requires_grad=True) + self.run_test(model, x) + def test_flatten(self): class FlattenModel(torch.nn.Module): def forward(self, input): @@ -3382,7 +3400,9 @@ def forward(self, x): def test_eye(self): class TensorFactory(torch.nn.Module): def forward(self, x): - return torch.eye(x.size()[1], 3), torch.eye(4, 4, dtype=torch.long), torch.eye(x.size()[1], 2, dtype=torch.long) + return torch.eye(x.size()[1], 3), torch.eye(4, 4, dtype=torch.long), \ + torch.eye(x.size()[1], 2, dtype=torch.long), torch.eye(x.shape[0]), \ + torch.eye(x.shape[0], dtype=torch.float64) x = torch.randn(2, 3, 4) another_x = torch.randn(5, 6, 7) @@ -3578,6 +3598,7 @@ def forward(self, x): self.run_test(MaskedSelectModel(), x) @skipIfUnsupportedMinOpsetVersion(11) + @disableScriptTest() # dtype not available def test_index_put_to_masked_fill(self): class MaskedFillModel(torch.nn.Module): def forward(self, input_mask, some_const): @@ -3591,6 +3612,7 @@ def forward(self, input_mask, some_const): self.run_test(MaskedFillModel(), (mask, constant)) @skipIfUnsupportedMinOpsetVersion(11) + @disableScriptTest() # dtype not available def test_index_put_to_masked_scatter(self): class MaskedScatterModel(torch.nn.Module): def forward(self, input_mask, some_const): @@ -3658,7 +3680,6 @@ def forward(self, x): self.run_test(FullModel(), x) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() # dtype mismatch def test_full_like(self): class FullLikeModel(torch.nn.Module): def forward(self, x): @@ -3668,7 +3689,6 @@ def forward(self, x): self.run_test(FullLikeModel(), x) @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() # dtype mismatch def test_full_like_value(self): class FullLikeModel(torch.nn.Module): def forward(self, x, y): @@ -4317,7 +4337,6 @@ def forward(self, input, target): @skipIfUnsupportedMinOpsetVersion(9) - @disableScriptTest() # Output dtype mismatch def test_kldiv_loss(self): x = torch.randn(5) diff --git a/test/quantization/test_quantize_fx.py b/test/quantization/test_quantize_fx.py index e057e25643a4..16694b0f0356 100644 --- a/test/quantization/test_quantize_fx.py +++ b/test/quantization/test_quantize_fx.py @@ -500,48 +500,52 @@ def forward(self, x): original_ref_m.conv2.bias = torch.nn.Parameter(original_m.standalone.conv.bias.detach()) qconfig_dict = {"": default_qconfig} - prepare_custom_config_dict = {"standalone_module_name": ["standalone"]} - # check prepared model - m = 
prepare_fx( - original_m, qconfig_dict, prepare_custom_config_dict=prepare_custom_config_dict) - # calibration - m(data) - # input and output of first conv, observer for standalone module - # will be inserted in the standalone module itself - count_check = { - ns.call_module(torch.quantization.MinMaxObserver): 2 - } - self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) - # for output of conv in the standalone module - count_check = { - ns.call_module(torch.quantization.MinMaxObserver): 1 - } - self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) + config_name = {"standalone_module_name": ["standalone"]} + config_class = {"standalone_module_class": [StandaloneModule]} + for prepare_config in [config_name, config_class]: + original_m_copy = copy.deepcopy(original_m) + original_ref_m_copy = copy.deepcopy(original_ref_m) + # check prepared model + m = prepare_fx( + original_m_copy, qconfig_dict, prepare_custom_config_dict=prepare_config) + # calibration + m(data) + # input and output of first conv, observer for standalone module + # will be inserted in the standalone module itself + count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 2 + } + self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + # for output of conv in the standalone module + count_check = { + ns.call_module(torch.quantization.MinMaxObserver): 1 + } + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) - # check converted/quantized model - m = convert_fx(m) - count_check = { - ns.call_function(torch.quantize_per_tensor) : 1, - ns.call_module(nnq.Conv2d) : 1, - ns.call_method('dequantize') : 1, - } - self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) - count_check = { - # quantization of input happens in parent module - # quantization of output happens in the quantized conv module - ns.call_function(torch.quantize_per_tensor) : 0, - # dequantization for output happens in parent module - ns.call_method('dequantize') : 0, - } - self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) - res = m(data) - - # quantize the reference model - ref_m = prepare_fx(original_ref_m, qconfig_dict) - ref_m(data) - ref_m = convert_fx(ref_m) - ref_res = ref_m(data) - self.assertEqual(res, ref_res) + # check converted/quantized model + m = convert_fx(m) + count_check = { + ns.call_function(torch.quantize_per_tensor) : 1, + ns.call_module(nnq.Conv2d) : 1, + ns.call_method('dequantize') : 1, + } + self.checkGraphModuleNodes(m, expected_node_occurrence=count_check) + count_check = { + # quantization of input happens in parent module + # quantization of output happens in the quantized conv module + ns.call_function(torch.quantize_per_tensor) : 0, + # dequantization for output happens in parent module + ns.call_method('dequantize') : 0, + } + self.checkGraphModuleNodes(m.standalone, expected_node_occurrence=count_check) + res = m(data) + + # quantize the reference model + ref_m = prepare_fx(original_ref_m_copy, qconfig_dict) + ref_m(data) + ref_m = convert_fx(ref_m) + ref_res = ref_m(data) + self.assertEqual(res, ref_res) @skipIfNoFBGEMM def test_qconfig_none(self): diff --git a/test/quantization/test_workflow_module.py b/test/quantization/test_workflow_module.py index cd722d59d2a2..22751697cd1d 100644 --- a/test/quantization/test_workflow_module.py +++ b/test/quantization/test_workflow_module.py @@ -482,6 +482,29 @@ def test_save_load_state_dict_script(self): # Verify that state_dict matches exactly with original 
one. self.assertEqual(scripted.state_dict(), scripted_2.state_dict()) + + @unittest.skipIf(not TEST_MULTIGPU, "multi-GPU not supported") + @unittest.skipIf(not TEST_CUDA, "CUDA unavailable") + def test_observer_qparams_respects_device_affinity(self): + """ + Ensure that the scale and zero_point returned by the observer + are on the same device as the input tensor. + """ + observerList = [MinMaxObserver(), + MovingAverageMinMaxObserver(), + PerChannelMinMaxObserver(), + MovingAveragePerChannelMinMaxObserver()] + for obs in observerList: + device = torch.device('cuda:1') + x = torch.randn(1, 2, device=device) + obs.to(device) + result = obs(x) + scale, zero_point = obs.calculate_qparams() + + self.assertEqual(x.device, scale.device) + self.assertEqual(x.device, zero_point.device) + + # HistogramObserver that works like it does on master class _ReferenceHistogramObserver(HistogramObserver): def __init__(self, *args, **kwargs): diff --git a/test/test_autograd.py b/test/test_autograd.py index e651bfe477dd..177a9b4c7805 100644 --- a/test/test_autograd.py +++ b/test/test_autograd.py @@ -3271,7 +3271,7 @@ def test_profiler_aggregation_lstm(self): print(prof.key_averages(group_by_input_shape=True).table( sort_by="self_cpu_time_total", row_limit=10)) print(prof.table( - sort_by="self_cpu_time_total", row_limit=10, header="TEST", top_level_events_only=True)) + sort_by="self_cpu_time_total", row_limit=10, max_src_column_width=300, header="TEST", top_level_events_only=True)) print(prof.key_averages(group_by_input_shape=True).table( sort_by="self_cpu_time_total", row_limit=10, top_level_events_only=True)) diff --git a/test/test_foreach.py b/test/test_foreach.py index e164fd352648..a723efa04684 100644 --- a/test/test_foreach.py +++ b/test/test_foreach.py @@ -28,35 +28,6 @@ class TestForeach(TestCase): torch.div, ] - unary_ops = [ - # foreach_op, foreach_op_, torch_op, bf16, complex64/128 - (torch._foreach_sqrt, torch._foreach_sqrt_, torch.sqrt, True , True), - (torch._foreach_exp, torch._foreach_exp_, torch.exp, True, True), - (torch._foreach_acos, torch._foreach_acos_, torch.acos, False, True), - (torch._foreach_asin, torch._foreach_asin_, torch.asin, False, True), - (torch._foreach_atan, torch._foreach_atan_, torch.atan, False, True), - (torch._foreach_cos, torch._foreach_cos_, torch.cos, True, True), - (torch._foreach_cosh, torch._foreach_cosh_, torch.cosh, False, True), - (torch._foreach_log, torch._foreach_log_, torch.log, True, True), - (torch._foreach_log10, torch._foreach_log10_, torch.log10, True, True), - (torch._foreach_log2, torch._foreach_log2_, torch.log2, True, True), - (torch._foreach_neg, torch._foreach_neg_, torch.neg, True, True), - (torch._foreach_tan, torch._foreach_tan_, torch.tan, False, True), - (torch._foreach_tanh, torch._foreach_tanh_, torch.tanh, True, True), - (torch._foreach_sin, torch._foreach_sin_, torch.sin, False, True), - (torch._foreach_sinh, torch._foreach_sinh_, torch.sinh, False, True), - (torch._foreach_ceil, torch._foreach_ceil_, torch.ceil, False, False), - (torch._foreach_erf, torch._foreach_erf_, torch.erf, True, False), - (torch._foreach_erfc, torch._foreach_erfc_, torch.erfc, False, False), - (torch._foreach_expm1, torch._foreach_expm1_, torch.expm1, False, False), - (torch._foreach_floor, torch._foreach_floor_, torch.floor, False, False), - (torch._foreach_log1p, torch._foreach_log1p_, torch.log1p, True, False), - (torch._foreach_round, torch._foreach_round_, torch.round, False, False), - - # See test_abs - # (torch._foreach_abs, torch._foreach_abs_, 
torch.abs, True, True), - ] - def _get_test_data(self, device, dtype, N): if dtype in [torch.bfloat16, torch.bool, torch.float16]: tensors = [torch.randn(N, N, device=device).to(dtype) for _ in range(N)] @@ -85,6 +56,21 @@ def _test_bin_op_list(self, device, dtype, foreach_op, foreach_op_, torch_op): else: self.assertEqual(tensors1, expected) + def _test_unary_op(self, device, dtype, foreach_op, foreach_op_, torch_op): + for N in N_values: + tensors1 = self._get_test_data(device, dtype, N) + # Mimics cuda kernel dtype flow. With fp16/bf16 input, runs in fp32 and casts output back to fp16/bf16. + control_dtype = torch.float32 if (self.device_type == 'cuda' and + (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype + expected = [torch_op(tensors1[i].to(dtype=control_dtype)).to(dtype=dtype) for i in range(N)] + res = foreach_op(tensors1) + foreach_op_(tensors1) + self.assertEqual(res, tensors1) + if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: + self.assertEqual(tensors1, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + else: + self.assertEqual(tensors1, expected) + def _test_pointwise_op(self, device, dtype, foreach_op, foreach_op_, torch_op): for N in N_values: values = [2 + i for i in range(N)] @@ -163,106 +149,13 @@ def _test_bin_op_list_alpha(self, device, dtype, foreach_op, foreach_op_, torch_ # # Unary ops # - @dtypes(*(torch.testing.floating_and_complex_types_and(torch.bfloat16, torch.half))) - def test_unary_ops(self, device, dtype): - for fe_op, fe_op_, torch_op, support_bfloat16, support_complex in self.unary_ops: - for N in N_values: - tensors1 = self._get_test_data(device, dtype, N) - # Mimics cuda kernel dtype flow. With fp16/bf16 input, runs in fp32 and casts output back to fp16/bf16. - control_dtype = torch.float32 if (self.device_type == 'cuda' and - (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype - - if self.device_type == 'cpu' and dtype == torch.half and torch_op != torch.neg: - with self.assertRaisesRegex(RuntimeError, r"not implemented for \'Half\'"): - expected = [torch_op(tensors1[i]) for i in range(N)] - - with self.assertRaisesRegex(RuntimeError, r"not implemented for \'Half\'"): - res = fe_op(tensors1) - break - - if dtype == torch.bfloat16 and not support_bfloat16: - if self.device_type == 'cuda' or torch_op in [torch.sinh, torch.cosh]: - with self.assertRaisesRegex(RuntimeError, r"not implemented for \'BFloat16\'"): - expected = [torch_op(tensors1[i]) for i in range(N)] - - with self.assertRaisesRegex(RuntimeError, r"not implemented for \'BFloat16\'"): - res = fe_op(tensors1) - break - - if dtype in [torch.complex64, torch.complex128] and not support_complex: - # not using assertRaisesRegex due to different error messages - with self.assertRaises(RuntimeError): - expected = [torch_op(tensors1[i]) for i in range(N)] - - with self.assertRaises(RuntimeError): - res = fe_op(tensors1) - break - - expected = [torch_op(tensors1[i].to(dtype=control_dtype)).to(dtype=dtype) for i in range(N)] - res = fe_op(tensors1) - if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: - self.assertEqual(res, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) + @dtypes(*[torch.float, torch.double, torch.complex64, torch.complex128]) + def test_sqrt(self, device, dtype): + self._test_unary_op(device, dtype, torch._foreach_sqrt, torch._foreach_sqrt_, torch.sqrt) - fe_op_(tensors1) - self.assertEqual(res, tensors1) - else: - self.assertEqual(res, expected) - - fe_op_(tensors1) - 
self.assertEqual(res, tensors1) - - # Separate test for abs due to a lot of special cases - # Absolute value of a complex number a + bj is defined as sqrt(a^2 + b^2), i.e. a floating point - @dtypes(*(torch.testing.floating_and_complex_types_and(torch.bfloat16, torch.half))) - def test_abs(self, device, dtype): - for N in N_values: - tensors1 = self._get_test_data(device, dtype, N) - # Mimics cuda kernel dtype flow. With fp16/bf16 input, runs in fp32 and casts output back to fp16/bf16. - control_dtype = torch.float32 if (self.device_type == 'cuda' and - (dtype is torch.float16 or dtype is torch.bfloat16)) else dtype - - expected = [torch.abs(tensors1[i].to(dtype=control_dtype)).to(dtype=dtype) for i in range(N)] - res = torch._foreach_abs(tensors1) - if (dtype is torch.float16 or dtype is torch.bfloat16) and TEST_WITH_ROCM: - self.assertEqual(res, expected, atol=1.e-3, rtol=self.dtype_precisions[dtype][0]) - - torch._foreach_abs_(tensors1) - self.assertEqual(res, tensors1) - else: - if self.device_type == 'cpu': - if dtype == torch.complex64: - expected = [torch.abs(tensors1[i].to(dtype=control_dtype)) for i in range(N)] - self.assertEqual(res, expected) - - with self.assertRaisesRegex(RuntimeError, r"In-place abs is not supported for complex tensors."): - torch._foreach_abs_(tensors1) - break - elif dtype == torch.complex128: - expected = [torch.abs(tensors1[i].to(dtype=control_dtype)) for i in range(N)] - self.assertEqual(res, expected) - - with self.assertRaisesRegex(RuntimeError, r"In-place abs is not supported for complex tensors."): - torch._foreach_abs_(tensors1) - break - else: - self.assertEqual(res, expected) - else: - if dtype == torch.complex64: - expected = [torch.abs(tensors1[i].to(dtype=control_dtype)).to(torch.complex64) for i in range(N)] - self.assertEqual(res, expected) - - torch._foreach_abs_(tensors1) - self.assertEqual(res, tensors1) - break - elif dtype == torch.complex128: - expected = [torch.abs(tensors1[i].to(dtype=control_dtype)).to(torch.complex128) for i in range(N)] - self.assertEqual(res, expected) - - torch._foreach_abs_(tensors1) - self.assertEqual(res, tensors1) - break - else: - self.assertEqual(res, expected) + @dtypes(*[torch.float, torch.double, torch.complex64, torch.complex128]) + def test_exp(self, device, dtype): + self._test_unary_op(device, dtype, torch._foreach_exp, torch._foreach_exp_, torch.exp) # # Pointwise ops diff --git a/test/test_fx.py b/test/test_fx.py index 349941c72f86..dcb104528402 100644 --- a/test/test_fx.py +++ b/test/test_fx.py @@ -1104,6 +1104,54 @@ def forward(self, x): traced = torch.fx.symbolic_trace(Foo()) assert(all('constant' not in node.target for node in traced.graph.nodes)) + def test_single_default_arg(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, y=1): + return y + + m = M() + self.checkGraphModule(m, ()) + self.checkGraphModule(m, (3,)) + + def test_multiple_default_args(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, y=1, z=2): + return y + z + + m = M() + self.checkGraphModule(m, ()) + self.checkGraphModule(m, (3,)) + self.checkGraphModule(m, (3, 4)) + + def test_regular_and_default_args(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + + def forward(self, x, y=1): + return x + y + + m = M() + self.checkGraphModule(m, (2,)) + self.checkGraphModule(m, (2, 3)) + + def test_string_literal_return(self): + class M(torch.nn.Module): + def __init__(self): + super().__init__() + + def 
forward(self): + return "foo" + + m = M() + self.checkGraphModule(m, ()) + if __name__ == '__main__': run_tests() diff --git a/test/test_fx_experimental.py b/test/test_fx_experimental.py index ab387a12a6ea..9a75663e4205 100644 --- a/test/test_fx_experimental.py +++ b/test/test_fx_experimental.py @@ -1,36 +1,121 @@ import torch +from typing import Dict from torch.fx.symbolic_trace import symbolic_trace from torch.fx.graph_module import GraphModule +from torch.fx.node import Node from torch.fx.experimental import GraphManipulation from torch.fx.experimental.Partitioner import Partitioner, Device, PartitionerConfig from torch.fx.experimental.rewriter import RewritingTracer from torch.testing._internal.common_utils import run_tests from torch.testing._internal.jit_utils import JitTestCase -from torch.fx.experimental.partitioner_utils import NodeLatency, \ - get_partition_to_latency_mapping, get_latency_of_partitioned_graph +from torch.fx.experimental.partitioner_utils import ( + NodeLatency, + get_partition_to_latency_mapping, + get_latency_of_partitioned_graph, +) from typing import Union, Callable + def symbolic_trace_with_rewrite(root: Union[torch.nn.Module, Callable]) -> GraphModule: - return GraphModule(root if isinstance(root, torch.nn.Module) else torch.nn.Module(), RewritingTracer().trace(root)) + return GraphModule( + root if isinstance(root, torch.nn.Module) else torch.nn.Module(), + RewritingTracer().trace(root), + ) + class TestFXExperimental(JitTestCase): + def test_serialize_graph(self): + class TestModule(torch.nn.Module): + def __init__(self): + super().__init__() + self.linear = torch.nn.Linear(4, 4) + self.e = torch.rand(4) + + def forward(self, a, b): + add_1 = a + b + linear = self.linear(add_1) + add_2 = linear + self.e + return add_2 + + m = TestModule() + traced = symbolic_trace(m) + a = torch.rand(4) + b = torch.rand(4) + GraphManipulation.get_size_of_all_nodes(traced, [a, b]) + + partitioner = Partitioner() + devices = [Device("dev_0", 5000, 0), Device("dev_1", 125, 1)] + partitioner_config = PartitionerConfig(devices, is_sparse_nn=True) + ret = partitioner.partition_graph(traced, m, partitioner_config) + module_with_submodules = ret.module_with_submodules + # Fix for now to add type/shape to output + for node in traced.graph.nodes: + if node.op == "output": + node.shape = a.shape + node.dtype = a.dtype + for mod in module_with_submodules.modules(): + if isinstance(mod, GraphModule): + for node in mod.graph.nodes: + node.shape = a.shape + node.dtype = a.dtype + for node in module_with_submodules.graph.nodes: + node.shape = a.shape + node.dtype = a.dtype + + agm1 = GraphManipulation.AcceleratedGraphModule(traced) + agm2 = GraphManipulation.AcceleratedGraphModule(module_with_submodules) + assert len(agm1.weights) == 3 + assert len(agm2.weights) == 3 + assert len(agm1.serialized_graph["nodes"]) == 7 + assert len(agm1.serialized_graph["weights"]) == 3 + assert len(agm1.serialized_graph["modules"]) == 0 + assert len(agm2.serialized_graph["nodes"]) == 5 + assert len(agm2.serialized_graph["weights"]) == 3 + assert len(agm2.serialized_graph["modules"]) == 1 + assert agm1.serialized_graph["weights"]["linear.weight"]["shape"] == "[4, 4]" + assert ( + agm1.serialized_graph["weights"]["linear.weight"]["dtype"] + == "torch.float32" + ) + assert ( + agm1.serialized_graph["weights"]["linear.weight"]["is_quantized"] is False + ) + assert agm1.serialized_graph["nodes"][0]["shape"] == "[4]" + assert agm1.serialized_graph["nodes"][0]["dtype"] == "torch.float32" + assert 
agm1.serialized_graph["nodes"][0]["target"] == "a" + assert agm1.serialized_graph["nodes"][0]["op_code"] == "placeholder" + assert agm1.serialized_graph["nodes"][0]["name"] == "a" + assert agm1.serialized_graph["nodes"][2]["args"][0]["name"] == "a" + assert agm1.serialized_graph["nodes"][2]["args"][0]["is_node"] is True + + # Test quantization info serialization. + x = torch.tensor([[-1.0, 0.0], [1.0, 2.0]]) + q_tensor = torch.quantize_per_tensor(x, 1, 0, torch.qint32) + q_tensor_channel = torch.quantize_per_channel( + x, torch.tensor([0.1, 0.01]), torch.tensor([10, 0]), 0, torch.quint8 + ) + result = GraphManipulation.serialize_tensor_quantization(q_tensor) + result2 = GraphManipulation.serialize_tensor_quantization(q_tensor_channel) + assert result["q_scheme"] == "torch.per_tensor_affine" + assert result["q_scale"] == 1.0 + assert result2["q_scheme"] == "torch.per_channel_affine" + assert len(result2["q_per_channel_scales"]) == 2 + def test_find_single_partition(self): class TestModule(torch.nn.Module): def forward(self, a, b): return a + b + m = TestModule() traced = symbolic_trace(m) a = torch.rand(1) b = torch.rand(1) - GraphManipulation.get_size_of_all_nodes( - traced, - [a, b] - ) + GraphManipulation.get_size_of_all_nodes(traced, [a, b]) partitioner = Partitioner() devices = [ - Device('dev_0', 125, 0), - Device('dev_1', 125, 1), - Device('dev_2', 125, 2) + Device("dev_0", 125, 0), + Device("dev_1", 125, 1), + Device("dev_2", 125, 2), ] partitioner_config = PartitionerConfig(devices) ret = partitioner.partition_graph(traced, m, partitioner_config) @@ -56,15 +141,12 @@ def forward(self, a, b): traced = symbolic_trace(m) a = torch.rand(4) b = torch.rand(4) - GraphManipulation.get_size_of_all_nodes( - traced, - [a, b] - ) + GraphManipulation.get_size_of_all_nodes(traced, [a, b]) partitioner = Partitioner() devices = [ - Device('dev_0', 125, 0), - Device('dev_1', 125, 1), - Device('dev_2', 125, 2) + Device("dev_0", 125, 0), + Device("dev_1", 125, 1), + Device("dev_2", 125, 2), ] partitioner_config = PartitionerConfig(devices) ret = partitioner.partition_graph(traced, m, partitioner_config) @@ -91,15 +173,9 @@ def forward(self, a): m = TestModule() traced = symbolic_trace(m) a = torch.rand(4) - GraphManipulation.get_size_of_all_nodes( - traced, - [a] - ) + GraphManipulation.get_size_of_all_nodes(traced, [a]) partitioner = Partitioner() - devices = [ - Device('dev_0', 120, 0), - Device('dev_1', 160, 1) - ] + devices = [Device("dev_0", 120, 0), Device("dev_1", 160, 1)] partitioner_config = PartitionerConfig(devices, is_sparse_nn=False) ret = partitioner.partition_graph(traced, m, partitioner_config) module_with_submodules = ret.module_with_submodules @@ -128,12 +204,12 @@ def __init__(self): layers = self.create_mlp(3, 24, 24) self.top_layers = torch.nn.Sequential(*layers) self.embedding_layers = torch.nn.ModuleList() - el = torch.nn.EmbeddingBag(500000, 4, mode='sum', sparse=True) + el = torch.nn.EmbeddingBag(500000, 4, mode="sum", sparse=True) self.embedding_layers.append(el) for i in range(3): - el = torch.nn.EmbeddingBag(1000000, 4, mode='sum', sparse=True) + el = torch.nn.EmbeddingBag(1000000, 4, mode="sum", sparse=True) self.embedding_layers.append(el) - el = torch.nn.EmbeddingBag(500000, 4, mode='sum', sparse=True) + el = torch.nn.EmbeddingBag(500000, 4, mode="sum", sparse=True) self.embedding_layers.append(el) def forward(self, a, b, offset): @@ -141,27 +217,29 @@ def forward(self, a, b, offset): y = [] c = [] for i in range(len(self.embedding_layers)): - temp = torch.randint(10, 
(8, )) + temp = torch.randint(10, (8,)) c.append(temp + b) for i in range(len(self.embedding_layers)): if i % 2 == 0: y.append(self.embedding_layers[i](c[i], offset)) else: - y.append(self.embedding_layers[i](torch.randint(10, (8, )), offset)) + y.append( + self.embedding_layers[i](torch.randint(10, (8,)), offset) + ) z = torch.cat([x] + y, dim=1) p = self.top_layers(z) return p m = MyRecommendationModule() a = torch.rand(2, 4) - b = torch.randint(10, (8, )) - offset = torch.randint(1, (2, )) + b = torch.randint(10, (8,)) + offset = torch.randint(1, (2,)) traced = symbolic_trace(m) GraphManipulation.get_size_of_all_nodes(traced, [a, b, offset]) devices = [ - Device('dev_0', 33000000, 0), - Device('dev_1', 33000000, 1), - Device('dev_2', 33000000, 2) + Device("dev_0", 33000000, 0), + Device("dev_1", 33000000, 1), + Device("dev_2", 33000000, 2), ] partitioner_config = PartitionerConfig(devices, is_sparse_nn=True) partitioner = Partitioner() @@ -187,15 +265,19 @@ def forward(self, a): def get_node_to_latency_mapping(fx_module: GraphModule): """Given a fx module, generate node latency for each node - based on the size of each node + based on the size of each node """ node_to_latency_mapping: Dict[Node, NodeLatency] = {} for node in fx_module.graph.nodes: - if node.op not in {'output', 'placeholder', 'get_attr'}: + if node.op not in {"output", "placeholder", "get_attr"}: if node.size_bytes.total_size == node.size_bytes.output_size: - node_to_latency_mapping[node] = NodeLatency(node.size_bytes.total_size, 2. * node.size_bytes.total_size) + node_to_latency_mapping[node] = NodeLatency( + node.size_bytes.total_size, 2.0 * node.size_bytes.total_size + ) else: - node_to_latency_mapping[node] = NodeLatency(node.size_bytes.total_size, node.size_bytes.output_size) + node_to_latency_mapping[node] = NodeLatency( + node.size_bytes.total_size, node.size_bytes.output_size + ) return node_to_latency_mapping m = TestModule() @@ -203,36 +285,33 @@ def get_node_to_latency_mapping(fx_module: GraphModule): a = torch.rand(4) GraphManipulation.get_size_of_all_nodes(traced, [a]) node_to_latency_mapping = get_node_to_latency_mapping(traced) - devices = [ - Device('dev_0', 200, 0), - Device('dev_1', 200, 1) - ] + devices = [Device("dev_0", 200, 0), Device("dev_1", 200, 1)] partitioner = Partitioner() partitioner_config = PartitionerConfig(devices, False) ret = partitioner.partition_graph(traced, m, partitioner_config) module_with_submodules = ret.module_with_submodules self.assertEqual(traced(a), module_with_submodules(a)) partitions = partitioner.partitions - partition_to_latency_mapping = get_partition_to_latency_mapping(partitions, node_to_latency_mapping) + partition_to_latency_mapping = get_partition_to_latency_mapping( + partitions, node_to_latency_mapping + ) for p in partition_to_latency_mapping: if p.partition_id == 0: - assert partition_to_latency_mapping[p] == (128., 80., 160.) + assert partition_to_latency_mapping[p] == (128.0, 80.0, 160.0) else: - assert partition_to_latency_mapping[p] == (16., 32., 32.) + assert partition_to_latency_mapping[p] == (16.0, 32.0, 32.0) transfer_rate_bytes_per_sec = 0.5 critical_path_latency_sec = get_latency_of_partitioned_graph( - partitions, - partition_to_latency_mapping, - transfer_rate_bytes_per_sec + partitions, partition_to_latency_mapping, transfer_rate_bytes_per_sec ) - assert critical_path_latency_sec == 208. 
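# [Editor's sketch, not part of the patch] A pure-Python toy illustrating the quantity
# checked by the latency assertions nearby: the "critical path latency" of a
# partitioned graph is the most expensive chain of partitions, where each hop adds the
# partition's own latency plus output_bytes / transfer_rate for the data handed to the
# next partition. The numbers are chosen to reproduce the 208.0 asserted here; the
# exact decomposition used by partitioner_utils may differ.
def critical_path_latency(partitions, edges, transfer_rate_bytes_per_sec):
    # partitions: {id: (overall_latency_sec, output_bytes)}
    # edges: {id: [downstream partition ids]}; roots are partitions with no parent
    def path_cost(pid):
        latency, out_bytes = partitions[pid]
        children = edges.get(pid, [])
        if not children:
            return latency
        comm = out_bytes / transfer_rate_bytes_per_sec
        return latency + comm + max(path_cost(c) for c in children)
    roots = set(partitions) - {c for cs in edges.values() for c in cs}
    return max(path_cost(r) for r in roots)

# Two partitions in a chain: 160s + 8 bytes / 0.5 B/s + 32s == 208s.
assert critical_path_latency({0: (160.0, 8.0), 1: (32.0, 0.0)}, {0: [1]}, 0.5) == 208.0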
+ assert critical_path_latency_sec == 208.0 def test_call_to_assert_no_msg(self): - class M(torch.nn.Module): def forward(self, a, b): assert a == b return a + b + m = M() traced = symbolic_trace_with_rewrite(m) @@ -240,7 +319,12 @@ def forward(self, a, b): traced.graph.lint(traced) # Check the IR to make sure there's a call_function node with target == "Assert" - self.assertTrue(any(node.op == "call_function" and node.target == torch.Assert for node in traced.graph.nodes)) + self.assertTrue( + any( + node.op == "call_function" and node.target == torch.Assert + for node in traced.graph.nodes + ) + ) # Ensure that the assert throws when it's supposed to and doesn't throw when it's not supposed to traced(3, 3) @@ -251,11 +335,11 @@ def forward(self, a, b): self.assertEqual(traced(3, 3), m(3, 3)) def test_call_to_assert_with_msg(self): - class M(torch.nn.Module): def forward(self, a, b): assert a == b, "test message" return a + b + m = M() traced = symbolic_trace_with_rewrite(m) @@ -263,7 +347,12 @@ def forward(self, a, b): traced.graph.lint(traced) # Check the IR to make sure there's a call_function node with target == "Assert" - self.assertTrue(any(node.op == "call_function" and node.target == torch.Assert for node in traced.graph.nodes)) + self.assertTrue( + any( + node.op == "call_function" and node.target == torch.Assert + for node in traced.graph.nodes + ) + ) # Ensure that the assert throws when it's supposed to and doesn't throw when it's not supposed to traced(3, 3) @@ -274,11 +363,11 @@ def forward(self, a, b): self.assertEqual(traced(3, 3), m(3, 3)) def test_call_to_assert_with_empty_msg(self): - class M(torch.nn.Module): def forward(self, a, b): assert a == b, "" return a + b + m = M() traced = symbolic_trace_with_rewrite(m) @@ -286,7 +375,12 @@ def forward(self, a, b): traced.graph.lint(traced) # Check the IR to make sure there's a call_function node with target == "Assert" - self.assertTrue(any(node.op == "call_function" and node.target == torch.Assert for node in traced.graph.nodes)) + self.assertTrue( + any( + node.op == "call_function" and node.target == torch.Assert + for node in traced.graph.nodes + ) + ) # Ensure that the assert throws when it's supposed to and doesn't throw when it's not supposed to traced(3, 3) @@ -297,7 +391,6 @@ def forward(self, a, b): self.assertEqual(traced(3, 3), m(3, 3)) def test_call_to_assert_with_multiline_message(self): - class M(torch.nn.Module): def forward(self, a, b): error_msg = """ @@ -306,6 +399,7 @@ def forward(self, a, b): """ assert a == b, error_msg return a + b + m = M() traced = symbolic_trace_with_rewrite(m) @@ -313,7 +407,12 @@ def forward(self, a, b): traced.graph.lint(traced) # Check the IR to make sure there's a call_function node with target == "Assert" - self.assertTrue(any(node.op == "call_function" and node.target == torch.Assert for node in traced.graph.nodes)) + self.assertTrue( + any( + node.op == "call_function" and node.target == torch.Assert + for node in traced.graph.nodes + ) + ) # Ensure that the assert throws when it's supposed to and doesn't throw when it's not supposed to error_msg = """ @@ -330,7 +429,9 @@ def forward(self, a, b): def test_traceable_function_with_nonstandard_name(self): def foo(x): return torch.relu(x) + traced = symbolic_trace_with_rewrite(foo) -if __name__ == '__main__': + +if __name__ == "__main__": run_tests() diff --git a/test/test_linalg.py b/test/test_linalg.py index 53ba84e6348a..cbab1bde6963 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -1208,6 +1208,145 @@ def 
test_dot_invalid_args(self, device): self._test_dot_vdot_invalid_args(device, torch.dot) self._test_dot_vdot_invalid_args(device, torch.dot, complex_dtypes=True) + def test_einsum(self, device): + def check(equation, *operands): + ref = np.einsum(equation, *[operand.cpu().numpy() for operand in operands]) + res = torch.einsum(equation, operands) + self.assertEqual(res.cpu(), torch.from_numpy(np.array(ref))) + + # Autograd check (FIXME: tests below fail check) + if equation not in {"i,i->", "i,i->i", "ij,ij->ij"}: + ops = [op.detach().requires_grad_() for op in operands] + self.assertTrue(torch.autograd.gradcheck(lambda *ops: torch.einsum(equation, ops), ops)) + for op in ops: + self.assertTrue(op._version == 0) + + # Test cases from https://gist.github.com/rockt/15ee013889d65342088e9260a377dc8f + x = torch.rand(5, device=device) + y = torch.rand(7, device=device) + A = torch.randn(3, 5, device=device) + B = torch.randn(2, 5, device=device) + C = torch.randn(2, 3, 5, device=device) + D = torch.randn(2, 5, 7, device=device) + E = torch.randn(7, 9, device=device) + F = torch.randn(2, 3, 3, 5, device=device) + G = torch.randn(5, 4, 6, device=device) + H = torch.randn(4, 4, device=device) + I = torch.rand(2, 3, 2, device=device) + + # Vector operations + check('i->', x) # sum + check('i,i->', x, x) # dot + check('i,i->i', x, x) # vector element-wisem mul + check('i,j->ij', x, y) # outer + + # Matrix operations + check("ij->ji", A) # transpose + check("ij->j", A) # row sum + check("ij->i", A) # col sum + check("ij,ij->ij", A, A) # matrix element-wise mul + check("ij,j->i", A, x) # matrix vector multiplication + check("ij,kj->ik", A, B) # matmul + check("ij,ab->ijab", A, E) # matrix outer product + + # Tensor operations + check("aij,ajk->aik", C, D) # batch matmul + check("ijk,jk->i", C, A) # tensor matrix contraction + check("aij,jk->aik", D, E) # tensor matrix contraction + check("abcd,dfg->abcfg", F, G) # tensor tensor contraction + check("ijk,jk->ik", C, A) # tensor matrix contraction with double indices + check("ijk,jk->ij", C, A) # tensor matrix contraction with double indices + check("ijk,ik->j", C, B) # non contiguous + check("ijk,ik->jk", C, B) # non contiguous with double indices + + # Test diagonals + check("ii", H) # trace + check("ii->i", H) # diagonal + check('iji->j', I) # non-contiguous trace + + # Test ellipsis + check("i...->...", H) + check("ki,...k->i...", A.t(), B) + check("k...,jk", A.t(), B) + check('...ik, ...kj -> ...ij', torch.rand(2, 3, 4), torch.rand(1, 5)) + check('bik,k...j->i...j', torch.rand(5, 2, 3), torch.rand(3, 2)) + check('i...j, ij... 
-> ...ij', torch.rand(2, 3, 4), torch.rand(2, 4, 2, 3)) + + # torch.bilinear with discontiguous tensors + l = torch.randn(10, 5, device=device).transpose(0, 1) + r = torch.randn(20, 5, device=device).transpose(0, 1) + w = torch.randn(15, 10, 20, device=device) + check("bn,anm,bm->ba", l, w, r) + # with strided tensors + check("bn,anm,bm->ba", l[:, ::2], w[:, ::2, ::2], r[:, ::2]) + + def test_einsum_corner_cases(self, device): + def check(equation, *operands, expected_output): + tensors = [torch.tensor(operand, dtype=torch.float32, device=device) if not isinstance(operand, tuple) + else torch.rand(operand, dtype=torch.float32, device=device) for operand in operands] + output = torch.einsum(equation, tensors) + self.assertEqual(output, torch.tensor(expected_output, dtype=torch.float32, device=device)) + + # Test equation variantions + check(' ', 1, expected_output=1) + check(' -> ', 1, expected_output=1) + check(' , ', 2, 2, expected_output=4) + check(' , , ', 2, 2, 2, expected_output=8) + check(' , -> ', 2, 2, expected_output=4) + check(' i ', [1], expected_output=[1]) + check(' i -> ', [1], expected_output=1) + check(' i -> i ', [1], expected_output=[1]) + check(' i , i ', [2], [2], expected_output=4) + check(' i , i -> i ', [2], [2], expected_output=[4]) + + # Test tensors with 0 size dimensions + check('i', [], expected_output=[]) + check(' i j -> j', [[], []], expected_output=[]) + check('ij->i', [[], []], expected_output=[0., 0.]) + check(' i j k , k -> i j ', (3, 0, 6), (6,), expected_output=[[], [], []]) + + # Test broadcasting + check('i,j', [2], [1, 2], expected_output=[[2, 4]]) + check('i,ij->ij', [1, 2], [[1, 2, 3], [2, 3, 4]], expected_output=[[1, 2, 3], [4, 6, 8]]) + + # Test ellipsis broadcasting + check('...', 1, expected_output=1) + check('...->', 1, expected_output=1) + check('...->...', 1, expected_output=1) + check('i...->i', [1], expected_output=[1]) + check('i...->...i', [1], expected_output=[1]) + + def test_einsum_error_cases(self, device): + def check(equation, operands, regex, exception=RuntimeError): + with self.assertRaisesRegex(exception, r'einsum\(\) ' + regex): + torch.einsum(equation, operands) + + x = torch.rand(2) + y = torch.rand(2, 3) + + check('', [], r'must provide at least one operand') + check('. ..', [x], r'found \'.\' for operand 0 that is not part of any ellipsis') + check('... ...', [x], r'found \'.\' for operand 0 for which an ellipsis was already found') + check('A', [x], r'operand subscript must be in range \[a, z\] but found A for operand 0') + check(',', [x], r'fewer operands were provided than specified in the equation') + check('', [x, x], r'more operands were provided than specified in the equation') + check('', [x], r'the number of subscripts in the equation \(0\) does not match the number ' + r'of dimensions \(1\) for operand 0 and no ellipsis was given') + check('ai', [x], r'the number of subscripts in the equation \(2\) does not match the number ' + r'of dimensions \(1\) for operand 0 and no ellipsis was given') + check('ai...', [x], r'the number of subscripts in the equation \(2\) is more than the number ' + r'of dimensions \(1\) for operand 0') + check('a->... 
.', [x], r'found \'.\' for output but an ellipsis \(...\) was already found') + check('a->..', [x], r'found \'.\' for output that is not part of any ellipsis \(...\)') + check('a->A', [x], r'subscripts must be in range \[a, z\] but found A for the output') + check('a->aa', [x], r'output subscript a appears more than once in the output') + check('a->i', [x], r'output subscript i does not appear in the equation for any input operand') + check('...->', [x], r'ellipsis \(...\) covering one or more dimensions was given in the input ' + r'but not in the output') + check('aa', [y], r'subscript a is repeated for operand 0 but the sizes don\'t match, 3 != 2') + check('a, ba', [x, y], r'operands do not broadcast with remapped shapes \[original->remapped\]: ' + r'\[2\]->\[1, 2\] \[2, 3\]->\[2, 3\]') + instantiate_device_type_tests(TestLinalg, globals()) if __name__ == '__main__': diff --git a/test/test_overrides.py b/test/test_overrides.py index 4734b3bc7c91..f12d9ace9cbd 100644 --- a/test/test_overrides.py +++ b/test/test_overrides.py @@ -821,6 +821,22 @@ def test_gradcheck(self): torch.add, }) +class TestNamedTuple(TestCase): + "Regression test for gh-47090" + def test_max(self): + x = torch.tensor([1, 2]) + xs = x.as_subclass(SubTensor2) + r = torch.max(x, dim=0) + rs = torch.max(xs, dim=0) + self.assertEqual(type(r), type(rs)) + self.assertEqual(r, rs) + +class TestGradNewOnesOverride(TestCase): + """ Regression test for gh-47069 """ + def test_newones(self): + t = torch.tensor([1, 2]).as_subclass(SubTensor2) + n = t.new_ones((1, 2)) + self.assertEqual(type(n), SubTensor2) if __name__ == '__main__': unittest.main() diff --git a/test/test_torch.py b/test/test_torch.py index ba9492c500f4..fce680b2b7af 100644 --- a/test/test_torch.py +++ b/test/test_torch.py @@ -1,6 +1,5 @@ import sys import io -import gc import inspect import itertools import math @@ -40,7 +39,7 @@ onlyCUDA, onlyCPU, \ dtypes, dtypesIfCUDA, dtypesIfCPU, deviceCountAtLeast, skipCUDAIf, precisionOverride, \ PYTORCH_CUDA_MEMCHECK, largeTensorTest, onlyOnCPUAndCUDA, expectedAlertNondeterministic -from typing import Dict, List, Tuple, Union +from typing import Dict, List import torch.backends.quantized import torch.testing._internal.data from torch.testing._internal.common_cuda import tf32_on_and_off, tf32_is_not_fp32, with_tf32_off @@ -2951,20 +2950,34 @@ def test_parsing_intlist(self): lambda: torch.tensor().new_zeros((5, 5), 0)) def test_half_tensor(self): - x = torch.randn(5, 5).float() - y = torch.randn(5, 5).float() - xh, yh = x.half(), y.half() + devices = ["cpu"] + if torch.cuda.is_available(): + devices.append("cuda") - self.assertEqual(x.half().float(), x, atol=1e-3, rtol=0) + # contiguous tensor + # non-contiguous tensor + # dense non-overlapping tensor + # non-dense non-overlapping sliced tensor + # non-dense overlapping equal strides + for device in devices: + tset = ( + torch.randn(4, 3, 2, device=device, dtype=torch.float).contiguous(), + torch.randn(4, 3, 2, device=device, dtype=torch.float).transpose(0, 1), + torch.randn(4, 3, 2, device=device, dtype=torch.float), + torch.randn(4, 3, 2, device=device, dtype=torch.float)[:, :, ::2], + torch.empty_strided( + (4, 2, 3), (10, 3, 3), device=device, dtype=torch.float + ).copy_(torch.rand((4, 2, 3), dtype=torch.float, device=device)), + ) - z = torch.Tensor(5, 5) - self.assertEqual(z.copy_(xh), x, atol=1e-3, rtol=0) - - with tempfile.NamedTemporaryFile() as f: - torch.save(xh, f) - f.seek(0) - xh2 = torch.load(f) - self.assertEqual(xh.float(), xh2.float()) + for x in tset: + 
self.assertEqual(x.half().float(), x, atol=1e-3, rtol=0) + xh = x.half() + with tempfile.NamedTemporaryFile() as f: + torch.save(xh, f) + f.seek(0) + xh2 = torch.load(f) + self.assertEqual(xh.float(), xh2.float()) def test_from_buffer(self): a = bytearray([1, 2, 3, 4]) @@ -7991,7 +8004,6 @@ def test_cholesky(self, device, dtype): B = torch.mm(L, L.t().conj()) self.assertEqual(A, B, atol=1e-14, rtol=0, msg='cholesky (lower) did not allow rebuilding the original matrix') - @skipIfRocm # This test has many dimensions, which is larger than the maximum dims supported by ROCm (16) def test_view(self, device): tensor = torch.rand(15, device=device) template = torch.rand(3, 5, device=device) @@ -9311,6 +9323,11 @@ def test_kthvalue(self, device, dtype): self.assertEqual(res1val[:, :], res2val[:, :, k - 1], atol=0, rtol=0) self.assertEqual(res1ind[:, :], res2ind[:, :, k - 1], atol=0, rtol=0) + # Test scalar input (test case from https://github.com/pytorch/pytorch/issues/30818) + # Tests that passing a scalar tensor or 1D tensor with 1 element work either way + x = torch.tensor([2], device=device, dtype=dtype) + self.assertEqual(x.squeeze().kthvalue(1), x.kthvalue(1)) + @skipCUDAIfNoMagma @skipCPUIfNoLapack @unittest.skipIf(not TEST_NUMPY, "NumPy not found") @@ -9649,7 +9666,7 @@ def test_multidim(x, dim): expected = fn(y, 1, keepdim=False) self.assertEqual(x[:, 1], expected, msg='{} with out= kwarg'.format(fn_name)) - @onlyCUDA + @slowTest @largeTensorTest('10GB') def test_reduction_split(self, device): # Test reduction when there is a 32bit-indexing split @@ -9658,13 +9675,6 @@ def test_reduction_split(self, device): result = input_.sum(dim=0) expect = input_[0] + input_[1] + input_[2] + input_[3] + input_[4] self.assertEqual(result, expect) - gc.collect() - torch.cuda.empty_cache() - a = torch.randn(8, 1, 128, 1024, 1024, device=device, dtype=torch.half) - self.assertEqual((a.sum(1) - a.squeeze()).abs().max(), 0) - gc.collect() - torch.cuda.empty_cache() - self.assertEqual((a.sum(1, keepdim=True) - a).abs().max(), 0) @onlyCUDA @dtypes(torch.half, torch.float, torch.double) @@ -16280,81 +16290,6 @@ def test_helper(min, max): test_helper(torch.finfo(dtype).tiny, torch.finfo(dtype).max) - @onlyCPU - @slowTest - @unittest.skipIf(not TEST_NUMPY, 'Numpy not found') - @dtypes(torch.double) - def test_einsum(self, device: torch.device, dtype: torch.dtype) -> None: - # test cases taken from https://gist.github.com/rockt/15ee013889d65342088e9260a377dc8f - x = torch.randn(5, dtype=dtype, device=device) - y = torch.randn(7, dtype=dtype, device=device) - A = torch.randn(3, 5, dtype=dtype, device=device) - B = torch.randn(2, 5, dtype=dtype, device=device) - C = torch.randn(2, 3, 5, dtype=dtype, device=device) - D = torch.randn(2, 5, 7, dtype=dtype, device=device) - E = torch.randn(7, 9, dtype=dtype, device=device) - F = torch.randn(2, 3, 5, 7, dtype=dtype, device=device) - G = torch.randn(7, 11, 13, dtype=dtype, device=device) - H = torch.randn(4, 4, dtype=dtype, device=device) - I = torch.randn(3, 4, 4, dtype=dtype, device=device) - l = torch.randn(5, 10, dtype=dtype, device=device) - r = torch.randn(5, 20, dtype=dtype, device=device) - w = torch.randn(30, 10, 20, dtype=dtype, device=device) - test_list: List[Union[Tuple[str, torch.Tensor], - Tuple[str, torch.Tensor, torch.Tensor], - Tuple[str, torch.Tensor, torch.Tensor, torch.Tensor]]] = [ - # -- Vector - ("i->", x), # sum - ("i,i->", x, x), # dot - ("i,i->i", x, x), # vector element-wise mul - ("i,j->ij", x, y), # outer - # -- Matrix - ("ij->ji", A), # 
transpose - ("ij->j", A), # row sum - ("ij->i", A), # col sum - ("ij,ij->ij", A, A), # matrix element-wise mul - ("ij,j->i", A, x), # matrix vector multiplication - ("ij,kj->ik", A, B), # matmul - ("ij,ab->ijab", A, E), # matrix outer product - # -- Tensor - ("aij,ajk->aik", C, D), # batch matmul - ("ijk,jk->i", C, A), # tensor matrix contraction - ("aij,jk->aik", D, E), # tensor matrix contraction - ("abcd,dfg->abcfg", F, G), # tensor tensor contraction - ("ijk,jk->ik", C, A), # tensor matrix contraction with double indices - ("ijk,jk->ij", C, A), # tensor matrix contraction with double indices - ("ijk,ik->j", C, B), # non contiguous - ("ijk,ik->jk", C, B), # non contiguous with double indices - # -- Diagonal - ("ii", H), # trace - ("ii->i", H), # diagonal - # -- Ellipsis - ("i...->...", H), - ("ki,...k->i...", A.t(), B), - ("k...,jk", A.t(), B), - ("...ii->...i", I), # batch diagonal - # -- Other - ("bn,anm,bm->ba", l, w, r), # as torch.bilinear - ("... ii->...i ", I), # batch diagonal with spaces - ] - for test in test_list: - actual = torch.einsum(test[0], test[1:]) - expected = np.einsum(test[0], *[t.numpy() for t in test[1:]]) - self.assertEqual(expected.shape, actual.shape, msg=test[0]) - self.assertEqual(expected, actual, msg=test[0]) - # test vararg - actual2 = torch.einsum(test[0], *test[1:]) - self.assertEqual(expected.shape, actual2.shape, msg=test[0]) - self.assertEqual(expected, actual2, msg=test[0]) - - def do_einsum(*args): - return torch.einsum(test[0], args) - # FIXME: following test cases fail gradcheck - if test[0] not in {"i,i->", "i,i->i", "ij,ij->ij"}: - gradcheck_inps = tuple(t.detach().requires_grad_() for t in test[1:]) - self.assertTrue(torch.autograd.gradcheck(do_einsum, gradcheck_inps)) - self.assertTrue(A._version == 0) # check that we do not use inplace ops - @onlyCPU @dtypes(torch.bool, torch.double) def test_sum_all(self, device, dtype) -> None: @@ -17347,8 +17282,11 @@ def _test_copysign_numpy(a, b): # Use double copysign to verify the correctnes of 0.0 and -0.0, since # it always True for self.assertEqual(0.0 == -0.0). So, we use 1 as the # magnitude to verify the sign between torch and numpy results, elementwise. - self.assertEqual(torch.copysign(torch.tensor(1.0), torch_result), - torch.copysign(torch.tensor(1.0), expected)) + # Special case: NaN conversions between FP32 and FP16 is not bitwise + # equivalent to pass this assertion. 
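# [Editor's sketch, not part of the patch] Why the copysign check above compares
# `copysign(1.0, ...)` rather than the raw values: copysign transfers only the sign
# bit, so mapping every element onto magnitude 1.0 turns "do torch and NumPy agree on
# the sign (including -0.0)?" into an ordinary equality check. fp16 is excluded
# because an fp32<->fp16 round trip need not preserve a NaN's sign bit bit-for-bit.
import torch

res = torch.tensor([3.0, -0.0, 0.0, -2.5])
signs = torch.copysign(torch.tensor(1.0), res)   # -> [1., -1., 1., -1.]
assert torch.equal(signs, torch.tensor([1.0, -1.0, 1.0, -1.0]))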
+ if a.dtype != torch.float16 and b.dtype != torch.float16: + self.assertEqual(torch.copysign(torch.tensor(1.0), torch_result), + torch.copysign(torch.tensor(1.0), expected)) # Compare Result with NumPy # Type promotion @@ -19155,11 +19093,7 @@ def test_nansum_out_dtype(self, device): torch_fn = partial(torch.nansum, dtype=out_dtype) np_out_dtype = torch_to_numpy_dtype_dict[out_dtype] np_fn = partial(np.nansum, dtype=np_out_dtype) - if (inp_dtype, out_dtype) == (torch.uint8, torch.float16): - # 25504.0 vs 25536.0 - self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None, atol=0, rtol=0.002) - else: - self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) + self.compare_with_numpy(torch_fn, np_fn, x, device=None, dtype=None) @dtypes(torch.int32, torch.int64) def test_large_linspace(self, device, dtype): @@ -19599,6 +19533,50 @@ def test_dstack(self, device, dtype): expected = np.dstack(np_input) self.assertEqual(actual, expected) + @unittest.skipIf(not TEST_NUMPY, "NumPy not found") + @dtypes(*(torch.testing.get_all_dtypes(include_half=True, include_bfloat16=False, + include_bool=True, include_complex=False))) + def test_all_any_vs_numpy(self, device, dtype): + def _test_all_any(x): + self.compare_with_numpy(torch.all, np.all, x) + self.compare_with_numpy(torch.any, np.any, x) + + def _test_all_any_with_dim(x, dim): + torch_fn = partial(torch.all, dim=dim) + np_fn = partial(np.all, axis=dim) + self.compare_with_numpy(torch_fn, np_fn, x, exact_dtype=False) + + torch_fn = partial(torch.any, dim=dim) + np_fn = partial(np.any, axis=dim) + self.compare_with_numpy(torch_fn, np_fn, x, exact_dtype=False) + + for ndim in range(5): + shape = self._rand_shape(ndim, 1, 5) + x = self._generate_input(shape, dtype, device, with_extremal=False) + _test_all_any(x) + + x = self._generate_input(shape, dtype, device, with_extremal=True) + _test_all_any(x) + + x = torch.zeros_like(x) + _test_all_any(x) + + x = torch.ones_like(x) + _test_all_any(x) + + for dim in range(ndim): + x = self._generate_input(shape, dtype, device, with_extremal=False) + _test_all_any_with_dim(x, dim) + + x = self._generate_input(shape, dtype, device, with_extremal=True) + _test_all_any_with_dim(x, dim) + + x = torch.zeros_like(x) + _test_all_any_with_dim(x, dim) + + x = torch.ones_like(x) + _test_all_any_with_dim(x, dim) + @onlyOnCPUAndCUDA def test_repeated_dim(self, device): ops = [torch.mean, torch.sum, torch.nansum, torch.std, torch.logsumexp, torch.std, torch.var, diff --git a/test/test_unary_ufuncs.py b/test/test_unary_ufuncs.py index 9f3353376913..95287da20755 100644 --- a/test/test_unary_ufuncs.py +++ b/test/test_unary_ufuncs.py @@ -9,7 +9,7 @@ from torch.testing._internal.common_utils import \ (TestCase, run_tests, torch_to_numpy_dtype_dict, suppress_warnings, - TEST_NUMPY, make_tensor) + TEST_NUMPY, IS_MACOS, make_tensor) from torch.testing._internal.common_methods_invocations import \ (unary_ufuncs) from torch.testing._internal.common_device_type import \ @@ -480,6 +480,16 @@ def test_nan_to_num(self, device, dtype): torch.nan_to_num(x, out=out, nan=nan, posinf=posinf, neginf=neginf) self.assertEqual(result, out) + @unittest.skipIf(IS_MACOS, "Skip Reference: https://github.com/pytorch/pytorch/issues/47500") + @dtypes(torch.cfloat, torch.cdouble) + def test_sqrt_complex_edge_values(self, device, dtype): + # Test Reference: https://github.com/pytorch/pytorch/pull/47424 + x = torch.tensor(0. 
- 1.0000e+20j, dtype=dtype, device=device) + self.compare_with_numpy(torch.sqrt, np.sqrt, x) + + x = torch.tensor(-1.0000e+20 - 4988429.2000j, dtype=dtype, device=device) + self.compare_with_numpy(torch.sqrt, np.sqrt, x) + instantiate_device_type_tests(TestUnaryUfuncs, globals()) if __name__ == '__main__': diff --git a/tools/autograd/templates/python_variable_methods.cpp b/tools/autograd/templates/python_variable_methods.cpp index 4b441d6f3616..15fd600e441c 100644 --- a/tools/autograd/templates/python_variable_methods.cpp +++ b/tools/autograd/templates/python_variable_methods.cpp @@ -56,7 +56,7 @@ static PyObject * THPVariable__is_view(PyObject *self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "_is_view"); + return handle_torch_function(self, "_is_view", args); } auto& self_ = reinterpret_cast(self)->cdata; if (self_.is_view()) { @@ -160,7 +160,7 @@ static PyObject * THPVariable_get_device(PyObject* self_, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self_)) { - return handle_torch_function(self_, "get_device"); + return handle_torch_function(self_, "get_device", args, nullptr); } auto& self = reinterpret_cast(self_)->cdata; return wrap(self.get_device()); @@ -171,7 +171,7 @@ static PyObject * THPVariable_has_names(PyObject* self_, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self_)) { - return handle_torch_function(self_, "has_names"); + return handle_torch_function(self_, "has_names", args); } auto& self = reinterpret_cast(self_)->cdata; return wrap(self.has_names()); @@ -183,7 +183,7 @@ static PyObject * THPVariable_data_ptr(PyObject* self_, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self_)) { - return handle_torch_function(self_, "data_ptr"); + return handle_torch_function(self_, "data_ptr", args); } auto& self = reinterpret_cast(self_)->cdata; return wrap(self.data_ptr()); @@ -207,7 +207,7 @@ static PyObject * THPVariable_dim(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "dim"); + return handle_torch_function(self, "dim", args); } auto& self_ = reinterpret_cast(self)->cdata; return THPUtils_packInt64(self_.dim()); @@ -219,7 +219,7 @@ static PyObject * THPVariable_numel(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "numel"); + return handle_torch_function(self, "numel", args); } auto& self_ = reinterpret_cast(self)->cdata; return THPUtils_packInt64(self_.numel()); @@ -333,7 +333,7 @@ static bool dispatch_to_Bool(const Tensor & self) { static PyObject * THPVariable_float_scalar(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "__float__"); + return handle_torch_function(self, "__float__", args); } jit::tracer::warn("Converting a tensor to a Python float", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; @@ -344,7 +344,7 @@ static PyObject * THPVariable_float_scalar(PyObject* self, PyObject* args) { static PyObject * THPVariable_complex_scalar(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "__complex__"); + return handle_torch_function(self, "__complex__", args); } jit::tracer::warn("Converting a tensor to a Python complex", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; @@ -355,7 +355,7 
@@ static PyObject * THPVariable_complex_scalar(PyObject* self, PyObject* args) { static PyObject * THPVariable_integral_scalar(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "__int__"); + return handle_torch_function(self, "__int__", args); } jit::tracer::warn("Converting a tensor to a Python integer", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; @@ -374,7 +374,7 @@ static PyObject * THPVariable_integral_scalar(PyObject* self, PyObject* args) { static PyObject * THPVariable_index_scalar(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "__index__"); + return handle_torch_function(self, "__index__", args); } jit::tracer::warn("Converting a tensor to a Python index", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; @@ -396,7 +396,7 @@ static Tensor dispatch_invert(const Tensor & self) { static PyObject * THPVariable_invert(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "__invert__"); + return handle_torch_function(self, "__invert__", args); } auto& self_ = reinterpret_cast(self)->cdata; if (!isIntegralType(self_.scalar_type(), /*includeBool=*/true)) { @@ -691,7 +691,7 @@ static PyObject * THPVariable_element_size(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "element_size"); + return handle_torch_function(self, "element_size", args); } auto& self_ = reinterpret_cast(self)->cdata; return THPUtils_packInt64(self_.element_size()); @@ -769,7 +769,7 @@ static PyObject * THPVariable_item(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "item"); + return handle_torch_function(self, "item", args); } jit::tracer::warn("Converting a tensor to a Python number", jit::tracer::WARN_PYTHON_DATAFLOW); auto& self_ = reinterpret_cast(self)->cdata; @@ -838,7 +838,7 @@ static PyObject * THPVariable_new(PyObject* self, PyObject* args, PyObject* kwar { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "new"); + return handle_torch_function(self, "new", args, kwargs); } auto& self_ = reinterpret_cast(self)->cdata; OptionalDeviceGuard device_guard(device_of(self_)); @@ -850,7 +850,7 @@ static PyObject * THPVariable_new_ones(PyObject* self, PyObject* args, PyObject* { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "new_ones"); + return handle_torch_function(self, "new_ones", args, kwargs); } auto& self_ = reinterpret_cast(self)->cdata; OptionalDeviceGuard device_guard(device_of(self_)); @@ -862,7 +862,7 @@ static PyObject * THPVariable_new_tensor(PyObject* self, PyObject* args, PyObjec { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "new_tensor"); + return handle_torch_function(self, "new_tensor", args, kwargs); } auto& self_ = reinterpret_cast(self)->cdata; OptionalDeviceGuard device_guard(device_of(self_)); @@ -941,7 +941,7 @@ static PyObject * THPVariable_tolist(PyObject* self, PyObject* args) { HANDLE_TH_ERRORS if (check_has_torch_function(self)) { - return handle_torch_function(self, "tolist"); + return handle_torch_function(self, "tolist", args); } jit::tracer::warn("Converting a tensor to a Python list", 
jit::tracer::WARN_PYTHON_DATAFLOW); auto self_ = reinterpret_cast(self)->cdata; @@ -1010,7 +1010,7 @@ static PyObject * THPVariable_type(PyObject* self, PyObject* args, PyObject* kwa static PyObject * THPVariable_bool_scalar(PyObject* self, PyObject* args) { if (check_has_torch_function(self)) { HANDLE_TH_ERRORS - return handle_torch_function(self, "__bool__"); + return handle_torch_function(self, "__bool__", args); END_HANDLE_TH_ERRORS } jit::tracer::warn("Converting a tensor to a Python boolean", jit::tracer::WARN_PYTHON_DATAFLOW); diff --git a/tools/codegen/gen.py b/tools/codegen/gen.py index 2b8f4fc64959..0ed2dff543fe 100644 --- a/tools/codegen/gen.py +++ b/tools/codegen/gen.py @@ -10,6 +10,7 @@ import pathlib import functools import json +from dataclasses import dataclass from tools.codegen.code_template import CodeTemplate from tools.codegen.model import * @@ -102,13 +103,25 @@ def parse_native_yaml(path: str) -> List[NativeFunction]: def with_native_function(func: Callable[[NativeFunction], T]) -> Callable[[NativeFunction], T]: @functools.wraps(func) def wrapper(f: NativeFunction) -> T: - with context(f'in {f.loc}:\n {f.func}'): - with local.parametrize( - use_c10_dispatcher=f.use_c10_dispatcher, - ): - return func(f) + with native_function_manager(f): + return func(f) return wrapper +def method_with_native_function(func: Callable[[S, NativeFunction], T]) -> Callable[[S, NativeFunction], T]: + @functools.wraps(func) + def wrapper(slf: S, f: NativeFunction) -> T: + with native_function_manager(f): + return func(slf, f) + return wrapper + +@contextlib.contextmanager +def native_function_manager(f: NativeFunction) -> Iterator[None]: + with context(f'in {f.loc}:\n {f.func}'): + with local.parametrize( + use_c10_dispatcher=f.use_c10_dispatcher, + ): + yield + # These two functions purposely return generators in analogy to map() # so that you don't mix up when you need to list() them @@ -180,49 +193,53 @@ def cpp_string(s: str) -> str: # # This function is also used for a secondary purpose: the registration # logic is also reused to implement per-operator registration. -def compute_type_method( - dispatch: Optional[str], *, +@dataclass(frozen=True) +class ComputeTypeMethod: + dispatch: Optional[str] + # TODO: Give more precise type Union[Literal[Target.DEFINITION, # Target.REGISTRATION]]; requires Literal from typing_extensions # which we don't have a dep for yet. - target: Target, + target: Target + # Selector object to determine which operators to generate # registration code for. 
selector: SelectiveBuilder -) -> Callable[[NativeFunction], Optional[str]]: - if dispatch is None: - assert target is Target.REGISTRATION + def __post_init__(self) -> None: + assert self.target is not Target.DECLARATION + if self.dispatch is None: + assert self.target is Target.REGISTRATION - @with_native_function - def func(f: NativeFunction) -> Optional[str]: - # Has to be here as mypy won't transfer asserts into closures - assert target is not Target.DECLARATION + @method_with_native_function + def __call__(self, f: NativeFunction) -> Optional[str]: + # for mypy type refinement; would be fixed by TODO on target + assert self.target is not Target.DECLARATION - if dispatch is not None: - if dispatch not in f.dispatch: + if self.dispatch is not None: + if self.dispatch not in f.dispatch: return None op_name = f"aten::{f.func.name}" - if target is Target.REGISTRATION and not selector.is_operator_selected(op_name): + if self.target is Target.REGISTRATION and not self.selector.is_operator_selected(op_name): return None name = native.name(f.func) returns_type = native.returns_type(f.func.returns) args = native.arguments(f.func) args_str = ', '.join(map(str, args)) - dispatch_to_all_backends = dispatch is not None and dispatch in KEYWORD_ALL_BACKENDS + dispatch_to_all_backends = self.dispatch is not None and self.dispatch in KEYWORD_ALL_BACKENDS - if target is Target.DEFINITION: - assert dispatch is not None - impl_name = f"at::native::{f.dispatch[dispatch]}" + if self.target is Target.DEFINITION: + assert self.dispatch is not None + impl_name = f"at::native::{f.dispatch[self.dispatch]}" args_exprs_str = ', '.join(a.name for a in args) return_kw = " return " cuda_guard = "" - if dispatch_to_all_backends or 'CUDA' in dispatch: + if dispatch_to_all_backends or 'CUDA' in self.dispatch: self_args = (a for a in f.func.arguments if a.name == "self") # There is precedence for which argument we use to do @@ -249,7 +266,7 @@ def func(f: NativeFunction) -> Optional[str]: # works just as well. if f.device_guard and dispatch_to_all_backends and has_tensor_options: cuda_guard = cuda_guard_from_tensor_options - elif f.device_guard and dispatch is not None and 'CUDA' in dispatch and has_tensor_options: + elif f.device_guard and self.dispatch is not None and 'CUDA' in self.dispatch and has_tensor_options: cuda_guard = f"""\ globalContext().lazyInitCUDA(); {cuda_guard_from_tensor_options} @@ -269,8 +286,8 @@ def func(f: NativeFunction) -> Optional[str]: }} """ - elif target is Target.REGISTRATION: - if dispatch is None: + elif self.target is Target.REGISTRATION: + if self.dispatch is None: return f'm.def({cpp_string(str(f.func))});\n' elif f.manual_kernel_registration: return None @@ -278,7 +295,7 @@ def func(f: NativeFunction) -> Optional[str]: if dispatch_to_all_backends: type_name = f'TypeDefault::{name}' else: - type_name = f'{dispatch}Type::{name}' + type_name = f'{self.dispatch}Type::{name}' dispatcher_sig = DispatcherSignature.from_schema(f.func) @@ -302,21 +319,22 @@ def func(f: NativeFunction) -> Optional[str]: # in a TORCH_LIBRARY_FRAGMENT that does not have an ambient backend. So # the torch::dispatch specification here is important! See # Note [Redundancy in registration code is OK] for how we handle redundant info. 
- if dispatch is not None: - payload = f"torch::dispatch(DispatchKey::{dispatch},\n{payload})\n" + if self.dispatch is not None: + payload = f"torch::dispatch(DispatchKey::{self.dispatch},\n{payload})\n" return f'm.impl("{f.func.name}",\n{payload});\n' else: - assert_never(target) - - return func + assert_never(self.target) # Generates Function.cpp and Function.h. These files provide the # functional public C++ API, and the scaffolding to call into # the dispatcher from these functions. See also compute_tensor_method. -def compute_function(*, target: Target) -> Callable[[NativeFunction], Optional[str]]: - @with_native_function - def go(f: NativeFunction) -> Optional[str]: +@dataclass(frozen=True) +class ComputeFunction: + target: Target + + @method_with_native_function + def __call__(self, f: NativeFunction) -> Optional[str]: if f.manual_kernel_registration: return None if Variant.function not in f.variants: @@ -326,13 +344,13 @@ def go(f: NativeFunction) -> Optional[str]: sig_group = CppSignatureGroup.from_schema(f.func, method=False) - if target is Target.DECLARATION: + if self.target is Target.DECLARATION: result = f"CAFFE2_API {sig_group.signature.decl()};\n" if sig_group.faithful_signature is not None: result += f"CAFFE2_API {sig_group.faithful_signature.decl()};\n" return result - assert target is Target.DEFINITION + assert self.target is Target.DEFINITION def generate_defn(sig: CppSignature) -> str: dispatcher_sig = DispatcherSignature.from_schema(f.func) @@ -357,14 +375,15 @@ def generate_defn(sig: CppSignature) -> str: return result - return go - # Generates TensorBody.h (sic) and TensorMethods.cpp. These files provide the # object-oriented (method-based) public C++ API, and the scaffolding to call into # the dispatcher from these functions. See also compute_function. -def compute_tensor_method(*, target: Target) -> Callable[[NativeFunction], Optional[str]]: - @with_native_function - def go(f: NativeFunction) -> Optional[str]: +@dataclass(frozen=True) +class ComputeTensorMethod: + target: Target + + @method_with_native_function + def __call__(self, f: NativeFunction) -> Optional[str]: if Variant.method not in f.variants: return None @@ -376,13 +395,13 @@ def go(f: NativeFunction) -> Optional[str]: sig_group = CppSignatureGroup.from_schema(f.func, method=True) - if target is Target.DECLARATION: + if self.target is Target.DECLARATION: result = f"{sig_group.signature.decl()} const;\n" if sig_group.faithful_signature is not None: result += f"{sig_group.faithful_signature.decl()} const;\n" return result - assert target is Target.DEFINITION + assert self.target is Target.DEFINITION def generate_defn(sig: CppSignature) -> str: dispatcher_sig = DispatcherSignature.from_schema(f.func) @@ -406,8 +425,6 @@ def generate_defn(sig: CppSignature) -> str: return result - return go - # Generates ATenOpList.cpp, a runtime accessible list of all aten # operators. # TODO: This was historically used to help some JIT interop code @@ -442,9 +459,12 @@ def compute_native_function_declaration(f: NativeFunction) -> List[str]: # Generates BackendSelectRegister.cpp, a series of kernels which provide # specialized computation of dispatch key for operator signatures which cannot # be easily done automatically using templating. 
-def compute_backend_select(*, target: Target) -> Callable[[NativeFunction], Optional[str]]: - @with_native_function - def go(f: NativeFunction) -> Optional[str]: +@dataclass(frozen=True) +class ComputeBackendSelect: + target: Target + + @method_with_native_function + def __call__(self, f: NativeFunction) -> Optional[str]: if str(f.func.name.name).endswith('_like') or str(f.func.name.name).startswith('new_'): return None @@ -471,7 +491,7 @@ def go(f: NativeFunction) -> Optional[str]: dispatcher_exprs = native_sig.dispatcher_exprs() dispatch_key = "options.computeDispatchKey()" - if target is Target.DEFINITION: + if self.target is Target.DEFINITION: # I don't think there's actually a good reason to generate # these two cases differently # The first case could probably be improved though- it calls dispatchTypeId(), @@ -494,7 +514,7 @@ def go(f: NativeFunction) -> Optional[str]: return op.callWithDispatchKey(_dk, {', '.join(a.expr for a in dispatcher_exprs)}); }} """ - elif target is Target.REGISTRATION: + elif self.target is Target.REGISTRATION: if local.use_c10_dispatcher() is UseC10Dispatcher.full: return f"""m.impl("aten::{f.func.name}", TORCH_FN({name}));""" elif local.use_c10_dispatcher() is UseC10Dispatcher.hacky_wrapper_for_legacy_signatures: @@ -504,11 +524,10 @@ def go(f: NativeFunction) -> Optional[str]: else: assert local.use_c10_dispatcher() is UseC10Dispatcher.with_codegenerated_unboxing_wrapper return f"""m.impl_UNBOXED("aten::{f.func.name}", {name});""" - elif target is Target.DECLARATION: + elif self.target is Target.DECLARATION: raise AssertionError() else: - assert_never(target) - return go + assert_never(self.target) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # # @@ -993,12 +1012,11 @@ def make_file_manager(install_dir: str) -> FileManager: '', 'Backend': dispatch, 'type_derived_method_definitions': list(mapMaybe( - compute_type_method(dispatch, target=Target.DEFINITION, selector=selector), + ComputeTypeMethod(dispatch, Target.DEFINITION, selector), native_functions )), 'function_registrations': list(mapMaybe( - compute_type_method( - dispatch, target=Target.REGISTRATION, selector=selector), + ComputeTypeMethod(dispatch, Target.REGISTRATION, selector), native_functions )), }) @@ -1012,35 +1030,35 @@ def make_file_manager(install_dir: str) -> FileManager: cpu_fm.write('TypeDefault.cpp', lambda: { 'type_method_definitions': list(mapMaybe( - compute_type_method('Math', target=Target.DEFINITION, selector=selector), + ComputeTypeMethod('Math', Target.DEFINITION, selector), native_functions)) + list(mapMaybe( - compute_type_method('DefaultBackend', target=Target.DEFINITION, selector=selector), + ComputeTypeMethod('DefaultBackend', Target.DEFINITION, selector), native_functions)), 'function_registrations': list(mapMaybe( - compute_type_method(None, target=Target.REGISTRATION, selector=schema_selector), + ComputeTypeMethod(None, Target.REGISTRATION, schema_selector), native_functions)), 'math_function_registrations': list(mapMaybe( - compute_type_method('Math', target=Target.REGISTRATION, selector=selector), + ComputeTypeMethod('Math', Target.REGISTRATION, selector), native_functions)), 'default_backend_function_registrations': list(mapMaybe( - compute_type_method('DefaultBackend', target=Target.REGISTRATION, selector=selector), + ComputeTypeMethod('DefaultBackend', Target.REGISTRATION, selector), native_functions)), }) cpu_fm.write('Functions.h', lambda: { - 'function_declarations': list(mapMaybe(compute_function(target=Target.DECLARATION), 
native_functions)), + 'function_declarations': list(mapMaybe(ComputeFunction(Target.DECLARATION), native_functions)), }) cpu_fm.write('Functions.cpp', lambda: { - 'function_definitions': list(mapMaybe(compute_function(target=Target.DEFINITION), native_functions)), + 'function_definitions': list(mapMaybe(ComputeFunction(Target.DEFINITION), native_functions)), }) core_fm.write('TensorBody.h', lambda: { - 'tensor_method_declarations': list(mapMaybe(compute_tensor_method(target=Target.DECLARATION), native_functions)), + 'tensor_method_declarations': list(mapMaybe(ComputeTensorMethod(Target.DECLARATION), native_functions)), }) core_fm.write('TensorMethods.cpp', lambda: { - 'tensor_method_definitions': list(mapMaybe(compute_tensor_method(target=Target.DEFINITION), native_functions)), + 'tensor_method_definitions': list(mapMaybe(ComputeTensorMethod(Target.DEFINITION), native_functions)), }) core_fm.write('ATenOpList.cpp', lambda: { 'aten_ops': list(mapMaybe(compute_aten_op, native_functions)), @@ -1050,9 +1068,9 @@ def make_file_manager(install_dir: str) -> FileManager: }) cpu_fm.write('BackendSelectRegister.cpp', lambda: { 'backend_select_method_definitions': - list(mapMaybe(compute_backend_select(target=Target.DEFINITION), native_functions)), + list(mapMaybe(ComputeBackendSelect(Target.DEFINITION), native_functions)), 'backend_select_function_registrations': - list(mapMaybe(compute_backend_select(target=Target.REGISTRATION), native_functions)), + list(mapMaybe(ComputeBackendSelect(Target.REGISTRATION), native_functions)), }) cpu_fm.write('Declarations.yaml', lambda: format_yaml([compute_declaration_yaml(f) for f in native_functions])) diff --git a/tools/codegen/model.py b/tools/codegen/model.py index 9d8e2e73693b..3d084d5edb32 100644 --- a/tools/codegen/model.py +++ b/tools/codegen/model.py @@ -391,28 +391,6 @@ def __post_init__(self) -> None: '_foreach_div_.List', '_foreach_exp_', '_foreach_sqrt_', - '_foreach_abs_', - '_foreach_acos_', - '_foreach_asin_', - '_foreach_atan_', - '_foreach_ceil_', - '_foreach_cos_', - '_foreach_cosh_', - '_foreach_erf_', - '_foreach_erfc_', - '_foreach_expm1_', - '_foreach_floor_', - '_foreach_log_', - '_foreach_log10_', - '_foreach_log1p_', - '_foreach_log2_', - '_foreach_neg_', - '_foreach_tan_', - '_foreach_tanh_', - '_foreach_sin_', - '_foreach_sinh_', - '_foreach_round_', - '_foreach_lgamma_', '_foreach_addcmul_.Scalar', '_foreach_addcdiv_.Scalar', '_foreach_addcmul_.ScalarList', diff --git a/torch/autograd/profiler.py b/torch/autograd/profiler.py index eba7368cb03e..ba7d44814421 100644 --- a/torch/autograd/profiler.py +++ b/torch/autograd/profiler.py @@ -151,7 +151,7 @@ def self_cpu_time_total(self): def cpu_children_populated(self): return self._cpu_children_populated - def table(self, sort_by=None, row_limit=100, header=None, top_level_events_only=False): + def table(self, sort_by=None, row_limit=100, max_src_column_width=75, header=None, top_level_events_only=False): """Prints an EventList as a nicely formatted table. 
Arguments: @@ -173,6 +173,7 @@ def table(self, sort_by=None, row_limit=100, header=None, top_level_events_only= self, sort_by=sort_by, row_limit=row_limit, + max_src_column_width=max_src_column_width, header=header, use_cuda=self._use_cuda, profile_memory=self._profile_memory, @@ -420,11 +421,11 @@ def _check_finish(self): raise RuntimeError("can't export a trace that didn't finish running") self.function_events.populate_cpu_children() - def table(self, sort_by=None, row_limit=100, header=None, top_level_events_only=False): + def table(self, sort_by=None, row_limit=100, max_src_column_width=75, header=None, top_level_events_only=False): self._check_finish() assert self.function_events is not None return self.function_events.table( - sort_by=sort_by, row_limit=row_limit, header=header, + sort_by=sort_by, row_limit=row_limit, max_src_column_width=max_src_column_width, header=header, top_level_events_only=top_level_events_only ) table.__doc__ = EventList.table.__doc__ @@ -1165,6 +1166,7 @@ def build_table( sort_by=None, header=None, row_limit=100, + max_src_column_width=75, use_cuda=True, profile_memory=False, top_level_events_only=False): @@ -1195,7 +1197,7 @@ def build_table( has_stack = len(stacks) > 0 if has_stack: src_column_width = max([max([len(entry) for entry in stack]) for stack in stacks]) + 4 - src_column_width = min(src_column_width, 75) + src_column_width = min(src_column_width, max_src_column_width) headers = [ 'Name', diff --git a/torch/csrc/distributed/c10d/init.cpp b/torch/csrc/distributed/c10d/init.cpp index d9ddf35ee1df..e9d8f618eb21 100644 --- a/torch/csrc/distributed/c10d/init.cpp +++ b/torch/csrc/distributed/c10d/init.cpp @@ -1,5 +1,6 @@ #include +#include #include #ifndef _WIN32 #include @@ -59,6 +60,8 @@ constexpr auto kDeprecationWarning = "{} API is being deprecated, please ping " "https://github.com/pytorch/pytorch/issues/46291 " "if you see this warning"; +template +using intrusive_ptr_class_ = py::class_>; // PythonStore is a pybind11 trampoline class to allow a Python // class to inherit from c10d.Store and implement its interface. @@ -339,7 +342,7 @@ They are used in specifying strategies for reduction collectives, e.g., .def_readwrite("timeout", &::c10d::AllToAllOptions::timeout); auto store = - py::class_<::c10d::Store, std::shared_ptr<::c10d::Store>, PythonStore>( + py::class_<::c10d::Store, c10::intrusive_ptr<::c10d::Store>, PythonStore>( module, "Store", R"( @@ -543,7 +546,7 @@ Example:: >>> store.wait(["bad_key"], timedelta(seconds=10)) )"); - shared_ptr_class_<::c10d::FileStore>( + intrusive_ptr_class_<::c10d::FileStore>( module, "FileStore", store, @@ -566,7 +569,7 @@ Example:: .def(py::init()); #ifndef _WIN32 - shared_ptr_class_<::c10d::HashStore>( + intrusive_ptr_class_<::c10d::HashStore>( module, "HashStore", store, @@ -583,7 +586,7 @@ Example:: )") .def(py::init<>()); - shared_ptr_class_<::c10d::TCPStore>( + intrusive_ptr_class_<::c10d::TCPStore>( module, "TCPStore", store, @@ -623,7 +626,7 @@ Example:: std::chrono::milliseconds(::c10d::Store::kDefaultTimeout)); #endif - shared_ptr_class_<::c10d::PrefixStore>( + intrusive_ptr_class_<::c10d::PrefixStore>( module, "PrefixStore", store, @@ -636,7 +639,7 @@ that adds a prefix to each key inserted to the store. prefix (str): The prefix string that is prepended to each key before being inserted into the store. store (torch.distributed.store): A store object that forms the underlying key-value store. 
)") - .def(py::init>()); + .def(py::init>()); auto processGroup = shared_ptr_class_<::c10d::ProcessGroup>(module, "ProcessGroup") @@ -949,13 +952,13 @@ that adds a prefix to each key inserted to the store. processGroupGloo .def( py::init< - const std::shared_ptr<::c10d::Store>&, + const c10::intrusive_ptr<::c10d::Store>&, int, int, ::c10d::ProcessGroupGloo::Options>(), py::call_guard()) .def( - py::init([](const std::shared_ptr<::c10d::Store>& store, + py::init([](const c10::intrusive_ptr<::c10d::Store>& store, int rank, int size, std::chrono::milliseconds timeout) { @@ -994,13 +997,13 @@ that adds a prefix to each key inserted to the store. module, "ProcessGroupNCCL", processGroup) .def( py::init< - const std::shared_ptr<::c10d::Store>&, + const c10::intrusive_ptr<::c10d::Store>&, int, int, ::c10d::ProcessGroupNCCL::Options>(), py::call_guard()) .def( - py::init([](const std::shared_ptr<::c10d::Store>& store, + py::init([](const c10::intrusive_ptr<::c10d::Store>& store, int rank, int size, const std::chrono::milliseconds& timeout) { @@ -1045,7 +1048,7 @@ that adds a prefix to each key inserted to the store. py::call_guard()); #endif - shared_ptr_class_<::c10d::ProcessGroup::Work>(module, "Work") + intrusive_ptr_class_<::c10d::ProcessGroup::Work>(module, "Work") .def("is_completed", &::c10d::ProcessGroup::Work::isCompleted) .def( "is_success", @@ -1165,7 +1168,7 @@ that adds a prefix to each key inserted to the store. // Python side of the world. Calling Python functions on a Python object // completely bypasses pybind11. We need to test that the overloaded // functions call into Python and behave like we expect. - [](std::shared_ptr<::c10d::Store> store) { + [](c10::intrusive_ptr<::c10d::Store> store) { auto add = [&store](const std::string& key, int64_t value) { store->add(key, value); }; diff --git a/torch/csrc/distributed/rpc/init.cpp b/torch/csrc/distributed/rpc/init.cpp index 1d82a619ed7e..81af4abebd5f 100644 --- a/torch/csrc/distributed/rpc/init.cpp +++ b/torch/csrc/distributed/rpc/init.cpp @@ -576,7 +576,7 @@ PyObject* rpc_init(PyObject* _unused, PyObject* noargs) { shared_ptr_class_(module, "TensorPipeAgent", rpcAgent) .def( - py::init([](const std::shared_ptr<::c10d::Store>& store, + py::init([](const c10::intrusive_ptr<::c10d::Store>& store, std::string selfName, worker_id_t selfId, int worldSize, diff --git a/torch/csrc/distributed/rpc/process_group_agent.cpp b/torch/csrc/distributed/rpc/process_group_agent.cpp index 2f29adc8f0c4..13e685b8fe74 100644 --- a/torch/csrc/distributed/rpc/process_group_agent.cpp +++ b/torch/csrc/distributed/rpc/process_group_agent.cpp @@ -398,7 +398,7 @@ void ProcessGroupAgent::handleSend(const SendWork& work) { // ProcessGroup is not thread-safe when sending with the same tag, // hence the lock - std::vector> pendingSends; + std::vector> pendingSends; const auto dst = work.to_.id_; // NOLINTNEXTLINE(cppcoreguidelines-pro-type-const-cast) diff --git a/torch/csrc/distributed/rpc/process_group_agent.h b/torch/csrc/distributed/rpc/process_group_agent.h index 1bc8db9ebf20..70fb1b40244d 100644 --- a/torch/csrc/distributed/rpc/process_group_agent.h +++ b/torch/csrc/distributed/rpc/process_group_agent.h @@ -230,14 +230,14 @@ class TORCH_API ProcessGroupAgent : public RpcAgent { // Lock and shared ptr to currently pending work, set in listenloop() and // interruptible in shutdown(). std::mutex recvWorkMutex_; - std::shared_ptr recvWork_; + c10::intrusive_ptr recvWork_; // Map of dst rank to current oustanding sends that we are waiting on. 
In the // case of a call to ::shutdown() while we are still waiting on these sends, // the pending sends contained in this map will be aborted, allowing the // waiting thread to be unblocked. std::unordered_map< worker_id_t, - std::set>> + std::set>> currentPendingSends_; // Lock to serialize access to the above map. std::mutex pendingSendMutex_; diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp index 6bf65f4c2628..eff1e7ebdf21 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.cpp +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.cpp @@ -220,7 +220,7 @@ void TensorPipeAgent::collectNames() { } TensorPipeAgent::TensorPipeAgent( - const std::shared_ptr<::c10d::Store>& store, + const c10::intrusive_ptr<::c10d::Store>& store, std::string selfName, worker_id_t selfId, int worldSize, diff --git a/torch/csrc/distributed/rpc/tensorpipe_agent.h b/torch/csrc/distributed/rpc/tensorpipe_agent.h index b4a500de65be..b8c9a8c64e5c 100644 --- a/torch/csrc/distributed/rpc/tensorpipe_agent.h +++ b/torch/csrc/distributed/rpc/tensorpipe_agent.h @@ -141,7 +141,7 @@ struct AggregatedNetworkData { class TensorPipeAgent : public RpcAgent { public: TensorPipeAgent( - const std::shared_ptr<::c10d::Store>& store, + const c10::intrusive_ptr<::c10d::Store>& store, std::string selfName, worker_id_t selfId, int worldSize, diff --git a/torch/csrc/jit/passes/freeze_module.cpp b/torch/csrc/jit/passes/freeze_module.cpp index 76b6f1d234ba..f4060a6c0e74 100644 --- a/torch/csrc/jit/passes/freeze_module.cpp +++ b/torch/csrc/jit/passes/freeze_module.cpp @@ -2,6 +2,7 @@ #include #include +#include #include #include @@ -75,6 +76,7 @@ class AttributePropagator { void run() { auto applyInline = [](std::shared_ptr& subgraph) { Inline(*subgraph); + ClearProfilingInformation(subgraph); }; auto applyOptimizations = [](std::shared_ptr& subgraph) { runOptimization(subgraph, /* unroll? 
*/ false); diff --git a/torch/csrc/jit/python/pybind_utils.h b/torch/csrc/jit/python/pybind_utils.h index 15c1cdd272b2..7630ad320adf 100644 --- a/torch/csrc/jit/python/pybind_utils.h +++ b/torch/csrc/jit/python/pybind_utils.h @@ -796,8 +796,7 @@ inline IValue toIValue( return c10::ivalue::ConcretePyObjectHolder::create(obj); } case TypeKind::CapsuleType: { - return IValue::make_capsule( - py::cast>(obj)); + return IValue::make_capsule(py::cast(obj).obj_ptr); } case TypeKind::FutureType: { return obj.cast>()->fut; @@ -1002,7 +1001,7 @@ inline py::object toPyObject(IValue ivalue) { // PyObject return py::reinterpret_borrow(ivalue.toPyObject()); } else if (ivalue.isCapsule()) { - return py::cast(ivalue.toCapsule()); + return py::cast(c10::Capsule(ivalue.toCapsule())); } else if (ivalue.isFuture()) { return py::cast(std::make_shared(ivalue.toFuture())); } else if (ivalue.isEnum()) { diff --git a/torch/csrc/jit/python/script_init.cpp b/torch/csrc/jit/python/script_init.cpp index a99f7469ac65..7c571384e481 100644 --- a/torch/csrc/jit/python/script_init.cpp +++ b/torch/csrc/jit/python/script_init.cpp @@ -714,7 +714,7 @@ void initJitScriptBindings(PyObject* module) { auto m = py::handle(module).cast(); // NOLINTNEXTLINE(bugprone-unused-raii) - py::class_>(m, "Capsule"); + py::class_(m, "Capsule"); auto object_class = py::class_(m, "ScriptObject") diff --git a/torch/csrc/utils/pybind.h b/torch/csrc/utils/pybind.h index 7df518f404c5..1447508535e5 100644 --- a/torch/csrc/utils/pybind.h +++ b/torch/csrc/utils/pybind.h @@ -17,6 +17,11 @@ namespace py = pybind11; +// This makes intrusive_ptr to be available as a custom pybind11 holder type, +// see +// https://pybind11.readthedocs.io/en/stable/advanced/smart_ptrs.html#custom-smart-pointers +PYBIND11_DECLARE_HOLDER_TYPE(T, c10::intrusive_ptr, true); + namespace pybind11 { namespace detail { // torch.autograd.Variable <-> at::Tensor conversions (without unwrapping) diff --git a/torch/csrc/utils/python_arg_parser.cpp b/torch/csrc/utils/python_arg_parser.cpp index ff94b1f5ceca..950e7d9fb82d 100644 --- a/torch/csrc/utils/python_arg_parser.cpp +++ b/torch/csrc/utils/python_arg_parser.cpp @@ -139,7 +139,7 @@ FunctionParameter::FunctionParameter(const std::string& fmt, bool keyword_only) auto handle_torch_function_getter(THPVariable* self, const std::string& property_name) -> PyObject* { py::object torch_api = PyObject_FastGetAttrString(THPVariableClass, (char*)property_name.c_str()); std::string module_name = "torch.Tensor." 
+ property_name; - return handle_torch_function((PyObject *)self, "__get__", nullptr, torch_api.ptr(), module_name); + return handle_torch_function((PyObject *)self, "__get__", nullptr, nullptr, torch_api.ptr(), module_name); } auto handle_torch_function_setter(THPVariable* self, const std::string& property_name, PyObject* value) -> int { @@ -148,10 +148,10 @@ auto handle_torch_function_setter(THPVariable* self, const std::string& property if (value != nullptr) { py::tuple args_ = py::make_tuple(py::handle(value)); - handle_torch_function((PyObject *)self, "__set__", args_.ptr(), torch_api.ptr(), module_name); + handle_torch_function((PyObject *)self, "__set__", args_.ptr(), nullptr, torch_api.ptr(), module_name); } else { - handle_torch_function((PyObject *)self, "__delete__", nullptr, torch_api.ptr(), module_name); + handle_torch_function((PyObject *)self, "__delete__", nullptr, nullptr, torch_api.ptr(), module_name); } return 0; } @@ -175,13 +175,13 @@ auto combine_self_args(PyObject *self, PyObject *args) -> py::tuple { return args_; } -auto handle_torch_function(PyObject* self, const std::string& func_name, PyObject* args, PyObject* torch_api, const std::string& module_name) -> PyObject* { +auto handle_torch_function(PyObject* self, const std::string& func_name, PyObject* args, PyObject* kwargs, PyObject* torch_api, const std::string& module_name) -> PyObject* { py::object torch_api_function = PyObject_FastGetAttrString(torch_api, (char*)func_name.c_str()); TORCH_INTERNAL_ASSERT(torch_api_function.ptr() != nullptr, "torch API function must exist"); py::tuple args_ = combine_self_args(self, args); py::tuple py_types = py::make_tuple(py::handle(PyObject_Type(self))); py::object torch_function = PyObject_FastGetAttrString(self, "__torch_function__"); - py::object ret = py::reinterpret_steal(PyObject_CallFunctionObjArgs(torch_function.ptr(), torch_api_function.ptr(), py_types.ptr(), args_.ptr(), NULL)); + py::object ret = py::reinterpret_steal(PyObject_CallFunctionObjArgs(torch_function.ptr(), torch_api_function.ptr(), py_types.ptr(), args_.ptr(), kwargs)); if (ret.ptr() == nullptr) { // if an exception occurred in a user's implementation of // __torch_function__, throw it diff --git a/torch/csrc/utils/python_arg_parser.h b/torch/csrc/utils/python_arg_parser.h index 773486f30ee1..b0b81a9517da 100644 --- a/torch/csrc/utils/python_arg_parser.h +++ b/torch/csrc/utils/python_arg_parser.h @@ -820,8 +820,8 @@ auto handle_torch_function(PythonArgs &r, PyObject* self, PyObject* args, PyObje // Used for functions which needs to parse python args. auto handle_torch_function(PythonArgs &r, PyObject* args, PyObject* kwargs, PyObject* torch_api, const char* module_name) -> PyObject*; -// Used for functions that accept no keyword arguments and have no argument parsing -auto handle_torch_function(PyObject* self, const std::string& func_name, PyObject* args=nullptr, PyObject* torch_api=THPVariableClass, const std::string& module_name="torch.Tensor") -> PyObject*; +// Used for functions that have no argument parsing. +auto handle_torch_function(PyObject* self, const std::string& func_name, PyObject* args=nullptr, PyObject* kwargs=nullptr, PyObject* torch_api=THPVariableClass, const std::string& module_name="torch.Tensor") -> PyObject*; // Used for functions created in C++, e.g., C++ custom op, which doesn't use PythonArgParser to get overloaded_args. 
auto handle_torch_function_no_python_arg_parser(const std::vector &overloaded_args, PyObject* args, PyObject* kwargs, const char* func_name, PyObject* torch_api_function, const char* module_name) -> PyObject*; diff --git a/torch/functional.py b/torch/functional.py index 3781b73a178e..e26b4c1b4125 100644 --- a/torch/functional.py +++ b/torch/functional.py @@ -262,76 +262,102 @@ def lu_unpack(LU_data, LU_pivots, unpack_data=True, unpack_pivots=True): def einsum(equation, *operands): r"""einsum(equation, *operands) -> Tensor -This function provides a way of computing multilinear expressions (i.e. sums of products) using the -Einstein summation convention. - -Args: - equation (string): The equation is given in terms of lower case letters (indices) to be associated - with each dimension of the operands and result. The left hand side lists the operands - dimensions, separated by commas. There should be one index letter per tensor dimension. - The right hand side follows after `->` and gives the indices for the output. - If the `->` and right hand side are omitted, it implicitly defined as the alphabetically - sorted list of all indices appearing exactly once in the left hand side. - The indices not apprearing in the output are summed over after multiplying the operands - entries. - If an index appears several times for the same operand, a diagonal is taken. - Ellipses `...` represent a fixed number of dimensions. If the right hand side is inferred, - the ellipsis dimensions are at the beginning of the output. - operands (Tensor): The operands to compute the Einstein sum of. - -.. note:: - - This function does not optimize the given expression, so a different formula for the same computation may - run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) - can optimize the formula for you. - -Examples:: - - >>> x = torch.randn(5) - >>> y = torch.randn(4) - >>> torch.einsum('i,j->ij', x, y) # outer product - tensor([[-0.0570, -0.0286, -0.0231, 0.0197], - [ 1.2616, 0.6335, 0.5113, -0.4351], - [ 1.4452, 0.7257, 0.5857, -0.4984], - [-0.4647, -0.2333, -0.1883, 0.1603], - [-1.1130, -0.5588, -0.4510, 0.3838]]) - - - >>> A = torch.randn(3,5,4) - >>> l = torch.randn(2,5) - >>> r = torch.randn(2,4) - >>> torch.einsum('bn,anm,bm->ba', l, A, r) # compare torch.nn.functional.bilinear - tensor([[-0.3430, -5.2405, 0.4494], - [ 0.3311, 5.5201, -3.0356]]) - - - >>> As = torch.randn(3,2,5) - >>> Bs = torch.randn(3,5,4) - >>> torch.einsum('bij,bjk->bik', As, Bs) # batch matrix multiplication - tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], - [-1.6706, -0.8097, -0.8025, -2.1183]], - - [[ 4.2239, 0.3107, -0.5756, -0.2354], - [-1.4558, -0.3460, 1.5087, -0.8530]], - - [[ 2.8153, 1.8787, -4.3839, -1.2112], - [ 0.3728, -2.1131, 0.0921, 0.8305]]]) - - >>> A = torch.randn(3, 3) - >>> torch.einsum('ii->i', A) # diagonal - tensor([-0.7825, 0.8291, -0.1936]) - - >>> A = torch.randn(4, 3, 3) - >>> torch.einsum('...ii->...i', A) # batch diagonal - tensor([[-1.0864, 0.7292, 0.0569], - [-0.9725, -1.0270, 0.6493], - [ 0.5832, -1.1716, -1.5084], - [ 0.4041, -1.1690, 0.8570]]) - - >>> A = torch.randn(2, 3, 4, 5) - >>> torch.einsum('...ij->...ji', A).shape # batch permute - torch.Size([2, 3, 5, 4]) -""" + Sums the product of the elements of the input :attr:`operands` along dimensions specified using a notation + based on the Einstein summation convention. 
+ + Einsum allows computing many common multi-dimensional linear algebraic array operations by representing them + in a short-hand format based on the Einstein summation convention, given by :attr:`equation`. The details of + this format are described below, but the general idea is to label every dimension of the input :attr:`operands` + with some subscript and define which subscripts are part of the output. The output is then computed by summing + the product of the elements of the :attr:`operands` along the dimensions whose subscripts are not part of the + output. For example, matrix multiplication can be computed using einsum as `torch.einsum("ij,jk->ik", A, B)`. + Here, j is the summation subscript and i and k the output subscripts (see section below for more details on why). + + Equation: + + The :attr:`equation` string specifies the subscripts (lower case letters `['a', 'z']`) for each dimension of + the input :attr:`operands` in the same order as the dimensions, separating subcripts for each operand by a + comma (','), e.g. `'ij,jk'` specify subscripts for two 2D operands. The dimensions labeled with the same subscript + must be broadcastable, that is, their size must either match or be `1`. The exception is if a subscript is + repeated for the same input operand, in which case the dimensions labeled with this subscript for this operand + must match in size and the operand will be replaced by its diagonal along these dimensions. The subscripts that + appear exactly once in the :attr:`equation` will be part of the output, sorted in increasing alphabetical order. + The output is computed by multiplying the input :attr:`operands` element-wise, with their dimensions aligned based + on the subscripts, and then summing out the dimensions whose subscripts are not part of the output. + + Optionally, the output subscripts can be explictly defined by adding an arrow ('->') at the end of the equation + followed by the subscripts for the output. For instance, the following equation computes the transpose of a + matrix multiplication: 'ij,jk->ki'. The output subscripts must appear at least once for some input operand and + at most once for the output. + + Ellipsis ('...') can be used in place of subscripts to broadcast the dimensions covered by the ellipsis. + Each input operand may contain at most one ellipsis which will cover the dimensions not covered by subscripts, + e.g. for an input operand with 5 dimensions, the ellipsis in the equation `'ab...c'` cover the third and fourth + dimensions. The ellipsis does not need to cover the same number of dimensions across the :attr:`operands` but the + 'shape' of the ellipsis (the size of the dimensions covered by them) must be broadcastable. In implicit mode, + the ellipsis will come first in the output. In explicit mode, if an ellipses covers at least one dimension then + it must appear in the output since the dimensions under the ellipsis cannot be summed over. e.g. the following + equation implements batch matrix multiplication `'...ij,...jk->...ik'`. + + A few final notes: the equation may contain whitespaces between the different elements (subscripts, ellipsis, + arrow and comma) but something like `'. . .'` is not valid. An empty string `''` is valid for scalar operands. + + .. note:: + + This function does not optimize the given expression, so a different formula for the same computation may + run faster or consume less memory. Projects like opt_einsum (https://optimized-einsum.readthedocs.io/en/stable/) + can optimize the formula for you. 
+ + Args: + equation (string): The subscripts for the Einstein summation. + operands (Tensor): The operands to compute the Einstein sum of. + + Examples:: + + # trace + >>> torch.einsum('ii', torch.randn(4, 4)) + tensor(-1.2104) + + # diagonal + >>> torch.einsum('ii->i', torch.randn(4, 4)) + tensor([-0.1034, 0.7952, -0.2433, 0.4545]) + + # outer product + >>> x = torch.randn(5) + >>> y = torch.randn(4) + >>> torch.einsum('i,j->ij', x, y) + tensor([[ 0.1156, -0.2897, -0.3918, 0.4963], + [-0.3744, 0.9381, 1.2685, -1.6070], + [ 0.7208, -1.8058, -2.4419, 3.0936], + [ 0.1713, -0.4291, -0.5802, 0.7350], + [ 0.5704, -1.4290, -1.9323, 2.4480]]) + + # batch matrix multiplication + >>> As = torch.randn(3,2,5) + >>> Bs = torch.randn(3,5,4) + >>> torch.einsum('bij,bjk->bik', As, Bs) + tensor([[[-1.0564, -1.5904, 3.2023, 3.1271], + [-1.6706, -0.8097, -0.8025, -2.1183]], + + [[ 4.2239, 0.3107, -0.5756, -0.2354], + [-1.4558, -0.3460, 1.5087, -0.8530]], + + [[ 2.8153, 1.8787, -4.3839, -1.2112], + [ 0.3728, -2.1131, 0.0921, 0.8305]]]) + + # batch permute + >>> A = torch.randn(2, 3, 4, 5) + >>> torch.einsum('...ij->...ji', A).shape + torch.Size([2, 3, 5, 4]) + + # equivalent to torch.nn.functional.bilinear + >>> A = torch.randn(3,5,4) + >>> l = torch.randn(2,5) + >>> r = torch.randn(2,4) + >>> torch.einsum('bn,anm,bm->ba', l, A, r) + tensor([[-0.3430, -5.2405, 0.4494], + [ 0.3311, 5.5201, -3.0356]]) + """ if not torch.jit.is_scripting(): if any(type(t) is not Tensor for t in operands) and has_torch_function(operands): return handle_torch_function(einsum, operands, equation, *operands) diff --git a/torch/fx/__init__.py b/torch/fx/__init__.py index c7fbd6fbf0ea..f3804c515612 100644 --- a/torch/fx/__init__.py +++ b/torch/fx/__init__.py @@ -44,7 +44,8 @@ def forward(self, x): The semantics are as follows: - `placeholder` represents a function input. The `name` attribute specifies the name this value will take on. - `target` is similarly the name of the argument. `args` and `kwargs` are don't-care. Placeholders correspond to + `target` is similarly the name of the argument. `args` holds either: 1) nothing, or 2) a single argument + denoting the default parameter of the function input. `kwargs` is don't-care. Placeholders correspond to the function parameters (e.g. `x`) in the graph printout. - `get_attr` retrieves a parameter from the module hierarchy. `name` is similarly the name the result of the fetch is assigned to. `target` is the fully-qualified name of the parameter's position in the module hierarchy. 
diff --git a/torch/fx/experimental/GraphManipulation.py b/torch/fx/experimental/GraphManipulation.py index 0e3b7b566ac0..7bd303f55d04 100644 --- a/torch/fx/experimental/GraphManipulation.py +++ b/torch/fx/experimental/GraphManipulation.py @@ -1,9 +1,12 @@ -from typing import Dict, List, NamedTuple -from torch.fx.graph_module import GraphModule -from torch.fx.node import Node, Target, map_arg -from torch.fx.graph import Graph +import json +from typing import Dict, List, NamedTuple, Any + import torch from torch.fx.experimental.shape_prop import ShapeProp +from torch.fx.graph import Graph, get_qualified_name +from torch.fx.graph_module import GraphModule +from torch.fx.node import Node, Target, map_arg + def replace_target_nodes_with( fx_module: GraphModule, @@ -15,22 +18,26 @@ def replace_target_nodes_with( """Modifies all nodes in fx_module.graph.nodes which match the specified op code and target, and updates them to match the new op code and target""" new_graph = Graph() - val_map : Dict[Node, Node] = {} + val_map: Dict[Node, Node] = {} for node in fx_module.graph.nodes: if node.op == old_op and node.target == old_target: args = map_arg(node.args, lambda n: val_map[n]) kwargs = map_arg(node.kwargs, lambda n: val_map[n]) assert isinstance(args, tuple) assert isinstance(kwargs, dict) - val_map[node] = new_graph.create_node(new_op, new_target, args, kwargs, node.name) + val_map[node] = new_graph.create_node( + new_op, new_target, args, kwargs, node.name + ) else: - val_map[node] = new_graph.node_copy(node, lambda n : val_map[n]) + val_map[node] = new_graph.node_copy(node, lambda n: val_map[n]) fx_module.graph = new_graph + class size_bytes(NamedTuple): output_size: int total_size: int + def get_size_of_all_nodes(fx_module: GraphModule, args: List[torch.Tensor]) -> None: """Given a fx graph module, update each node with its total size (weights + bias + output) and its output_size(output). For a non-module node, the total size is the output size. @@ -40,19 +47,20 @@ def get_size_of_all_nodes(fx_module: GraphModule, args: List[torch.Tensor]) -> N # Calculate the total size of the whole fx graph total_size_of_graph = 0.0 for node in fx_module.graph.nodes: - if node.op == 'output': + if node.op == "output": break node.size_bytes = get_size_of_node(fx_module, node) return + def get_size_of_node(fx_module: GraphModule, node: Node) -> size_bytes: """Given a node with node.dtype and node.shape, return its total size and its output size. 
- total_size = weights + bias + output_size + total_size = weights + bias + output_size """ # Total num of elements total_num_of_elems = 0 # For a module, conside all parameters - if node.op == 'call_module': + if node.op == "call_module": submodule_dict = dict(fx_module.named_modules()) submodule = submodule_dict[node.target] parameters = submodule.named_parameters() @@ -61,18 +69,165 @@ def get_size_of_node(fx_module: GraphModule, node: Node) -> size_bytes: total_num_of_elems += p.numel() # Don't forget the output size # node.shape is the shape of this node's output - shape = getattr(node, 'shape', None) + shape = getattr(node, "shape", None) if shape: output_elem = shape.numel() else: - raise RuntimeError('Node has no shape attr') + raise RuntimeError("Node has no shape attr") total_num_of_elems += output_elem size_per_elem_bytes = 0 - dtype = getattr(node, 'dtype', None) + dtype = getattr(node, "dtype", None) if dtype: size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size() else: - raise RuntimeError('Node has no dtype attr') + raise RuntimeError("Node has no dtype attr") total_size = size_per_elem_bytes * total_num_of_elems output_size = size_per_elem_bytes * output_elem return size_bytes(output_size, total_size) + + +def serialize_shape(shape: torch.Size) -> str: + return str(list(shape)) + + +def serialize_tensor_quantization(tensor: torch.Tensor) -> Dict[str, Any]: + scheme = {} # type: Dict[str, Any] + if tensor.is_quantized: + scheme["q_scheme"] = str(tensor.qscheme()) + if tensor.qscheme() in {torch.per_tensor_affine, torch.per_tensor_symmetric}: + scheme["q_scale"] = tensor.q_scale() + scheme["q_zero_pont"] = tensor.q_zero_point() + if tensor.qscheme() in { + torch.per_channel_affine, + torch.per_channel_affine_float_qparams, + torch.per_channel_symmetric, + }: + scheme["q_per_channel_scales"] = tensor.q_per_channel_scales().tolist() + scheme[ + "q_per_channel_zero_points" + ] = tensor.q_per_channel_zero_points().tolist() + scheme["q_per_channel_axis"] = tensor.q_per_channel_axis() + + return scheme + + +def serialize_weight(tensor: torch.Tensor) -> Dict: + weight = {} # type: Dict[str, Any] + weight["dtype"] = str(tensor.dtype) + weight["is_quantized"] = tensor.is_quantized + if tensor.is_quantized: + weight["quantized_type"] = serialize_tensor_quantization(tensor) + weight["shape"] = serialize_shape(tensor.shape) + return weight + + +def serialize_module(fx_module: GraphModule, weights: Dict, name_prefix="") -> Dict: + """Recursively Serializes a graph module (fx_module) to a dictionary which is later exported to JSON. + It also adds all weights the provided weights dictionary by qualified_name. + Dictionary Schema: + MODULE + { + modules: {module_name: MODULE], + nodes: [NODE], + weights {qualified_name: WEIGHT}, + } + NODE + { + shape: [], + dtype: dtype, + target: target, + op_code: op_code, + name: name, + args: [], + kwargs: {} + } + WEIGHT + { + dtype: dtype, + is_quantized: bool, + shape: [], + quantization_info: QUANTIZATION + } + QUANTIZATION + { + qscheme: qscheme, + q_scale: float, + q_zero_point: float, + q_per_channel_scales, [], + q_per_channel_zero_points: [], + q_per_channel_axis, int + } + """ + serialized_dict = {} # type: Dict[str, Any] + serialized_dict["modules"] = {} + serialized_dict["weights"] = {} + serialized_dict["nodes"] = [] + parameters = fx_module.named_parameters() + for name, p in parameters: + if isinstance(p, torch.Tensor): + weight = serialize_weight(p) + prefix = f"{name_prefix}." 
if name_prefix else "" + serialized_dict["weights"][prefix + name] = weight + weights[prefix + name] = p + for node in fx_module.graph.nodes: + node_rep = {} # type: Dict[str, Any] + # Get shape/type info, currently not needed for call_module. + if node.op != "call_module": + shape = getattr(node, "shape", None) + if shape: + node_rep["shape"] = serialize_shape(shape) + else: + raise RuntimeError( + "Node has no shape attr, this is likely because shape propagation has not been run on this Graph." + ) + dtype = getattr(node, "dtype", None) + if dtype: + node_rep["dtype"] = str(dtype) + else: + raise RuntimeError( + "Node has no dtype attr, this is likely because shape propagation has not been run on this Graph." + ) + + # Recurse down into any submodules we are calling. + if node.op == "call_module": + submodules = dict(fx_module.named_modules()) + if isinstance(submodules[node.target], GraphModule): + serialized_module = serialize_module( + getattr(fx_module, node.target), weights, node.target + ) + serialized_dict["modules"][node.target] = serialized_module + + if node.op == "call_function": + node_rep["target"] = get_qualified_name(node.target) + else: + node_rep["target"] = str(node.target) + + # Make sure we capture all constants. + if node.op == "get_attr": + target = getattr(fx_module, node.target) + prefix = f"{name_prefix}." if name_prefix else "" + qualname = prefix + node.target + if isinstance(target, torch.Tensor) and qualname not in weights: + weight = serialize_weight(target) + serialized_dict["weights"][prefix + node.target] = weight + weights[prefix + node.target] = target + + node_rep["op_code"] = node.op + node_rep["name"] = node.name + node_rep["args"] = map_arg( + node.args, lambda arg: {"is_node": True, "name": str(arg)} + ) + node_rep["kwargs"] = map_arg( + node.kwargs, lambda arg: {"is_node": True, "name": str(arg)} + ) + serialized_dict["nodes"] += [node_rep] + + return serialized_dict + + +class AcceleratedGraphModule: + def __init__(self, fx_module: GraphModule): + """Creates the needed data structures to pass to the glow runtime""" + self.weights = {} # type: Dict[str, Any] + self.serialized_graph = serialize_module(fx_module, self.weights) + self.serialized_graph_json = json.dumps(self.serialized_graph, indent=4) diff --git a/torch/fx/graph.py b/torch/fx/graph.py index 8a99f772c4c1..dd07ff7a508e 100644 --- a/torch/fx/graph.py +++ b/torch/fx/graph.py @@ -16,7 +16,7 @@ def _is_magic(x: str) -> bool: def snake_case(s: str) -> str: return ''.join(['_' + i.lower() if i.isupper() else i for i in s]).lstrip('_') -def _qualified_name(func: Callable[..., Any]) -> str: +def get_qualified_name(func: Callable[..., Any]) -> str: # things like getattr just appear in builtins if getattr(builtins, func.__name__, None) is func: return func.__name__ @@ -344,7 +344,8 @@ def type_repr(o : Any): if node.op == 'placeholder': assert isinstance(node.target, str) maybe_type_annotation = '' if node.type is None else f' : {type_repr(node.type)}' - free_vars.append(f'{node.target}{maybe_type_annotation}') + maybe_default_arg = '' if not node.args else f' = {repr(node.args[0])}' + free_vars.append(f'{node.target}{maybe_type_annotation}{maybe_default_arg}') raw_name = node.target.replace('*', '') if raw_name != node.name: body.append(f'{node.name} = {raw_name}\n') @@ -362,7 +363,7 @@ def type_repr(o : Any): assert isinstance(node.args, tuple) body.append(f'{node.name} = {magic_methods[node.target.__name__].format(*(repr(a) for a in node.args))}\n') continue - qualified_name = 
_qualified_name(node.target) + qualified_name = get_qualified_name(node.target) register_modules_used(qualified_name) if qualified_name == 'getattr' and \ isinstance(node.args, tuple) and \ @@ -384,7 +385,7 @@ def type_repr(o : Any): elif node.op == 'output': if node.type is not None: maybe_return_annotation = f" -> {type_repr(node.type)}" - body.append(f'return {node.args[0]}') + body.append(f'return {repr(node.args[0])}') continue raise NotImplementedError(f'node: {node.op} {node.target}') diff --git a/torch/fx/proxy.py b/torch/fx/proxy.py index d1672d332f14..317e039223a0 100644 --- a/torch/fx/proxy.py +++ b/torch/fx/proxy.py @@ -27,6 +27,15 @@ def proxy(self, node: Node) -> 'Proxy': def create_proxy(self, kind: str, target: Target, args: Tuple[Any, ...], kwargs: Dict[str, Any], name: Optional[str] = None, type_expr : Optional[Any] = None): + ''' + Create a Node from the given arguments, then return the Node + wrapped in a Proxy object. + + If kind = 'placeholder', then we're creating a Node that + represents the parameter of a function. If we need to encode + a default parameter, we use the `args` tuple. `args` is + otherwise empty for `placeholder` Nodes. + ''' args_ = self.create_arg(args) kwargs_ = self.create_arg(kwargs) assert isinstance(args_, tuple) diff --git a/torch/fx/symbolic_trace.py b/torch/fx/symbolic_trace.py index 44f0ffba98e0..20566bb58e6e 100644 --- a/torch/fx/symbolic_trace.py +++ b/torch/fx/symbolic_trace.py @@ -125,8 +125,15 @@ def create_args_for_root(self, root_fn, is_module): next(names_iter) # skip self args.append(self.root) + sig = inspect.signature(fn_for_analysis) + def proxy_placeholder(name: str): - return self.create_proxy('placeholder', name, (), {}, + if name[0] == '*': + default = () # type: ignore + else: + param = sig.parameters[name] + default = () if param.default is inspect.Parameter.empty else (param.default,) # type: ignore + return self.create_proxy('placeholder', name, default, {}, type_expr=fn_for_analysis.__annotations__.get(name, None)) args.extend(proxy_placeholder(next(names_iter)) for _ in range(skip_arg_idx, total_args)) diff --git a/torch/jit/_script.py b/torch/jit/_script.py index d4f6f96c3da2..b1f6a8bb3571 100644 --- a/torch/jit/_script.py +++ b/torch/jit/_script.py @@ -741,6 +741,19 @@ class RecursiveScriptModule(ScriptModule): # type: ignore def __init__(self, arg=None): super().__init__() +def call_prepare_scriptable_func(obj): + if not isinstance(obj, torch.nn.Module): + return obj + obj = obj.__prepare_scriptable__() if hasattr(obj, '__prepare_scriptable__') else obj # type: ignore + for name in obj.__dict__: + sub_module = obj.__dict__.get(name) + if name == '_modules': + for k, v in sub_module.items(): + sub_module[k] = call_prepare_scriptable_func(v) + obj.__setattr__(name, sub_module) + elif isinstance(sub_module, torch.nn.Module) and not isinstance(sub_module, ScriptModule): + obj.__setattr__(name, call_prepare_scriptable_func(sub_module)) + return obj def script(obj, optimize=None, _frames_up=0, _rcb=None): r""" @@ -894,6 +907,7 @@ def forward(self, input): return obj if isinstance(obj, torch.nn.Module): + obj = call_prepare_scriptable_func(obj) return torch.jit._recursive.create_script_module( obj, torch.jit._recursive.infer_methods_to_compile ) diff --git a/torch/lib/c10d/PrefixStore.cpp b/torch/lib/c10d/PrefixStore.cpp index 5f9a3c9c21ec..6f71e422bd0e 100644 --- a/torch/lib/c10d/PrefixStore.cpp +++ b/torch/lib/c10d/PrefixStore.cpp @@ -4,7 +4,7 @@ namespace c10d { PrefixStore::PrefixStore( const std::string& prefix, - 
std::shared_ptr store) + c10::intrusive_ptr store) : prefix_(prefix), store_(store) {} std::string PrefixStore::joinKey(const std::string& key) { diff --git a/torch/lib/c10d/PrefixStore.hpp b/torch/lib/c10d/PrefixStore.hpp index cad7112fbd76..ec50b3b719bf 100644 --- a/torch/lib/c10d/PrefixStore.hpp +++ b/torch/lib/c10d/PrefixStore.hpp @@ -7,7 +7,9 @@ namespace c10d { class PrefixStore : public Store { public: - explicit PrefixStore(const std::string& prefix, std::shared_ptr store); + explicit PrefixStore( + const std::string& prefix, + c10::intrusive_ptr store); virtual ~PrefixStore(){}; @@ -31,7 +33,7 @@ class PrefixStore : public Store { protected: std::string prefix_; - std::shared_ptr store_; + c10::intrusive_ptr store_; std::string joinKey(const std::string& key); std::vector joinKeys(const std::vector& keys); diff --git a/torch/lib/c10d/ProcessGroup.cpp b/torch/lib/c10d/ProcessGroup.cpp index 3521ed42c840..1d0d451f21a9 100644 --- a/torch/lib/c10d/ProcessGroup.cpp +++ b/torch/lib/c10d/ProcessGroup.cpp @@ -164,7 +164,7 @@ ProcessGroup::~ProcessGroup() {} // This is introduced so that implementors of ProcessGroup would not need to // have this implmentation. -std::shared_ptr ProcessGroup::allgather_coalesced( +c10::intrusive_ptr ProcessGroup::allgather_coalesced( std::vector>& /* usused */, std::vector& /* usused */, const AllgatherOptions& /* usused */) { diff --git a/torch/lib/c10d/ProcessGroup.hpp b/torch/lib/c10d/ProcessGroup.hpp index 5e90dccc25c0..63996b516a06 100644 --- a/torch/lib/c10d/ProcessGroup.hpp +++ b/torch/lib/c10d/ProcessGroup.hpp @@ -70,12 +70,11 @@ bool isP2POp(OpType opType); // class ProcessGroup { public: - // Please do not use ProcessGroup::Work API, it is going away, to be // replaced by ivalue::Future. // Python binding for this class might change, please do not assume // this will be bound using pybind. - class Work { + class Work : public torch::CustomClassHolder { public: Work(int rank = -1, OpType opType = OpType::UNKNOWN, const char* profilingTitle = nullptr); @@ -171,25 +170,25 @@ class ProcessGroup { return size_; } - virtual std::shared_ptr broadcast( + virtual c10::intrusive_ptr broadcast( std::vector& data, const BroadcastOptions& opts = BroadcastOptions()) = 0; - virtual std::shared_ptr allreduce( + virtual c10::intrusive_ptr allreduce( std::vector& data, const AllreduceOptions& opts = AllreduceOptions()) = 0; // This will be moved out of ProcessGroup, do not add dependencies on this // function. - virtual std::shared_ptr allreduce_coalesced( + virtual c10::intrusive_ptr allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts = AllreduceCoalescedOptions()) = 0; - virtual std::shared_ptr reduce( + virtual c10::intrusive_ptr reduce( std::vector& tensors, const ReduceOptions& opts = ReduceOptions()) = 0; - virtual std::shared_ptr allgather( + virtual c10::intrusive_ptr allgather( std::vector>& outputTensors, std::vector& inputTensors, const AllgatherOptions& opts = AllgatherOptions()) = 0; @@ -197,7 +196,7 @@ class ProcessGroup { // Gathers a single tensor inputBuffer into a single buffer outputBuffer that // is interpreted as a contigious collection of size inputBuffer * WORLD_SIZE. // For implementers of ProcessGroup API and advanced users only. 
- virtual std::shared_ptr allgather_base( + virtual c10::intrusive_ptr allgather_base( at::Tensor& outputBuffer, at::Tensor& inputBuffer, const AllgatherOptions& opts = AllgatherOptions()) = 0; @@ -206,27 +205,27 @@ class ProcessGroup { // * do not add dependencies on this function, // * do not implement it in your ProcessGroup, implement allgather_base // instead. - virtual std::shared_ptr allgather_coalesced( + virtual c10::intrusive_ptr allgather_coalesced( std::vector>& outputTensorLists, std::vector& inputTensors, const AllgatherOptions& opts = AllgatherOptions()); - virtual std::shared_ptr gather( + virtual c10::intrusive_ptr gather( std::vector>& outputTensors, std::vector& inputTensors, const GatherOptions& opts = GatherOptions()) = 0; - virtual std::shared_ptr scatter( + virtual c10::intrusive_ptr scatter( std::vector& outputTensors, std::vector>& inputTensors, const ScatterOptions& opts = ScatterOptions()) = 0; - virtual std::shared_ptr reduce_scatter( + virtual c10::intrusive_ptr reduce_scatter( std::vector& outputTensors, std::vector>& inputTensors, const ReduceScatterOptions& opts = ReduceScatterOptions()) = 0; - virtual std::shared_ptr alltoall_base( + virtual c10::intrusive_ptr alltoall_base( at::Tensor& outputTensor, at::Tensor& inputTensor, std::vector& outputSplitSizes, @@ -235,28 +234,28 @@ class ProcessGroup { throw std::runtime_error("ProcessGroup does not support alltoall"); } - virtual std::shared_ptr alltoall( + virtual c10::intrusive_ptr alltoall( std::vector& outputTensors, std::vector& inputTensors, const AllToAllOptions& opts = AllToAllOptions()) { throw std::runtime_error("ProcessGroup does not support alltoall"); } - virtual std::shared_ptr send( + virtual c10::intrusive_ptr send( std::vector& tensors, int dstRank, int tag) = 0; - virtual std::shared_ptr recv( + virtual c10::intrusive_ptr recv( std::vector& tensors, int srcRank, int tag) = 0; - virtual std::shared_ptr recvAnysource( + virtual c10::intrusive_ptr recvAnysource( std::vector& tensors, int tag) = 0; - virtual std::shared_ptr barrier( + virtual c10::intrusive_ptr barrier( const BarrierOptions& opts = BarrierOptions()) = 0; protected: diff --git a/torch/lib/c10d/ProcessGroupGloo.cpp b/torch/lib/c10d/ProcessGroupGloo.cpp index cd3e83e6b714..22da878cce43 100644 --- a/torch/lib/c10d/ProcessGroupGloo.cpp +++ b/torch/lib/c10d/ProcessGroupGloo.cpp @@ -38,6 +38,7 @@ #endif #include +#include #include #include #include @@ -107,7 +108,7 @@ namespace { // Wrap c10d store as Gloo store class GlooStore : public ::gloo::rendezvous::Store { public: - GlooStore(const std::shared_ptr<::c10d::Store>& store) : store_(store) {} + GlooStore(const c10::intrusive_ptr<::c10d::Store>& store) : store_(store) {} void set(const std::string& key, const std::vector& value) override { std::vector tmp(value.begin(), value.end()); @@ -130,7 +131,7 @@ class GlooStore : public ::gloo::rendezvous::Store { } protected: - std::shared_ptr<::c10d::Store> store_; + c10::intrusive_ptr<::c10d::Store> store_; }; typedef void (*ReduceFunc)(void*, const void*, const void*, size_t); @@ -561,7 +562,7 @@ std::shared_ptr<::gloo::transport::Device> ProcessGroupGloo:: #endif ProcessGroupGloo::ProcessGroupGloo( - const std::shared_ptr& store, + const c10::intrusive_ptr& store, int rank, int size, Options options) @@ -653,11 +654,11 @@ void ProcessGroupGloo::runLoop(int workerIndex) { AsyncWork::execute(std::move(work)); lock.lock(); - workInProgress_[workerIndex] = nullptr; + workInProgress_[workerIndex].reset(); } } -void 
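// Sketch (illustrative only, not applied by this patch): one small but easy-to-miss edit
// above is that clearing a worker slot changes from assigning nullptr to calling
// reset(), which is the idiomatic way to drop the reference held by a c10::intrusive_ptr.
//
#include <cstddef>
#include <vector>
#include <c10/util/intrusive_ptr.h>

struct SlotWork : c10::intrusive_ptr_target {};

void clear_slot(std::vector<c10::intrusive_ptr<SlotWork>>& workInProgress, size_t index) {
  // Same effect the old `workInProgress[index] = nullptr;` had on a shared_ptr:
  // the refcount is decremented and the slot is left empty.
  workInProgress[index].reset();
}
// end of sketch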
ProcessGroupGloo::enqueue(std::shared_ptr work) { +void ProcessGroupGloo::enqueue(c10::intrusive_ptr work) { std::unique_lock lock(workMutex_); workQueue_.push_back(std::move(work)); lock.unlock(); @@ -773,7 +774,7 @@ class AsyncBroadcastCUDAWork : public AsyncBroadcastWork { } // namespace -std::shared_ptr ProcessGroupGloo::broadcast( +c10::intrusive_ptr ProcessGroupGloo::broadcast( std::vector& inputs, const BroadcastOptions& opts) { static auto invalidArgument = [](const std::string& msg) { @@ -796,15 +797,15 @@ std::shared_ptr ProcessGroupGloo::broadcast( invalidArgument(c10::str("unsupported device type ", device.type())); } - std::shared_ptr work; + c10::intrusive_ptr work; auto tag = nextTag(); auto context = getContext(tag); if (device.type() == at::kCPU) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), inputs, opts.rootRank, opts.rootTensor, tag); #ifdef USE_CUDA } else if (device.type() == at::kCUDA) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), inputs, opts.rootRank, opts.rootTensor, tag); #endif } else { @@ -1300,7 +1301,7 @@ class AsyncSparseAllreduceCUDAWork : public AsyncSparseAllreduceWork { } // namespace -std::shared_ptr ProcessGroupGloo::allreduce( +c10::intrusive_ptr ProcessGroupGloo::allreduce( std::vector& inputs, const AllreduceOptions& opts) { static auto invalidArgument = [](const std::string& msg) { @@ -1329,15 +1330,15 @@ std::shared_ptr ProcessGroupGloo::allreduce( "(allreduce of sparse tensors only works with ReduceOp.SUM)"); } - std::shared_ptr work; + c10::intrusive_ptr work; auto tag = nextTag(); auto context = getContext(tag); if (device.type() == at::kCPU) { if (layout == c10::kStrided) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), inputs, opts.reduceOp, tag); } else if (layout == c10::kSparse) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), inputs, tag); } else { invalidArgument("unsupported layout"); @@ -1345,10 +1346,10 @@ std::shared_ptr ProcessGroupGloo::allreduce( #ifdef USE_CUDA } else if (device.type() == at::kCUDA) { if (layout == c10::kStrided) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), inputs, opts.reduceOp, tag); } else if (layout == c10::kSparse) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), inputs, tag); } else { invalidArgument("unsupported layout"); @@ -1362,7 +1363,7 @@ std::shared_ptr ProcessGroupGloo::allreduce( return work; } -std::shared_ptr ProcessGroupGloo::allreduce_coalesced( +c10::intrusive_ptr ProcessGroupGloo::allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts) { static auto invalidArgument = [](const std::string& msg) { @@ -1405,12 +1406,12 @@ std::shared_ptr ProcessGroupGloo::allreduce_coalesced( invalidArgument("unsupported layout"); } - std::shared_ptr work; + c10::intrusive_ptr work; const uint32_t tag = nextTag(); std::shared_ptr context = getContext(tag); if (device.type() == c10::kCPU) { if (layout == c10::kStrided) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), tensors, opts.reduceOp, tag); } else { invalidArgument("unsupported layout"); @@ -1538,7 +1539,7 @@ class AsyncReduceCUDAWork : public AsyncReduceWork { } // namespace -std::shared_ptr ProcessGroupGloo::reduce( +c10::intrusive_ptr ProcessGroupGloo::reduce( std::vector& inputs, const ReduceOptions& opts) { static auto invalidArgument = [](const std::string& msg) { @@ -1561,11 +1562,11 @@ std::shared_ptr 
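// Sketch (illustrative only, not applied by this patch): enqueue() now accepts the work
// handle as a c10::intrusive_ptr by value and moves it into the queue, so ownership is
// shared between the worker queue and whichever caller keeps the handle returned by the
// collective. Stripped-down version of that handoff, with QueuedWork standing in for
// the AsyncWork subclasses elided in this diff:
//
#include <deque>
#include <mutex>
#include <c10/util/intrusive_ptr.h>

struct QueuedWork : c10::intrusive_ptr_target {};

class DemoQueue {
 public:
  void enqueue(c10::intrusive_ptr<QueuedWork> work) {
    std::unique_lock<std::mutex> lock(mutex_);
    queue_.push_back(std::move(work));  // the queue now holds one of the references
  }

 private:
  std::deque<c10::intrusive_ptr<QueuedWork>> queue_;
  std::mutex mutex_;
};
// end of sketch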
ProcessGroupGloo::reduce( invalidArgument(c10::str("unsupported device type ", device.type())); } - std::shared_ptr work; + c10::intrusive_ptr work; auto tag = nextTag(); auto context = getContext(tag); if (device.type() == at::kCPU) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), inputs, opts.rootRank, @@ -1574,7 +1575,7 @@ std::shared_ptr ProcessGroupGloo::reduce( tag); #ifdef USE_CUDA } else if (device.type() == at::kCUDA) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), inputs, opts.rootRank, @@ -1720,7 +1721,7 @@ class AsyncAllgatherCUDAWork : public AsyncAllgatherWork { // Note: current CUDA implementation holds the assumption that the // tensors in the nested output tensor vectors are on the same device. -std::shared_ptr ProcessGroupGloo::allgather( +c10::intrusive_ptr ProcessGroupGloo::allgather( std::vector>& outputs, std::vector& inputs, const AllgatherOptions& opts) { @@ -1769,15 +1770,15 @@ std::shared_ptr ProcessGroupGloo::allgather( invalidArgument(c10::str("unsupported device type ", device.type())); } - std::shared_ptr work; + c10::intrusive_ptr work; auto tag = nextTag(); auto context = getContext(tag); if (device.type() == at::kCPU) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), outputs, inputs, tag); #ifdef USE_CUDA } else if (device.type() == at::kCUDA) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), outputs, inputs, tag); #endif } else { @@ -1852,7 +1853,7 @@ class AsyncAllgatherCoalescedWork : public ProcessGroupGloo::AsyncWork { } // namespace -std::shared_ptr ProcessGroupGloo::allgather_coalesced( +c10::intrusive_ptr ProcessGroupGloo::allgather_coalesced( std::vector>& output_lists, std::vector& input_list, const AllgatherOptions& /* unused */) { @@ -1902,13 +1903,13 @@ std::shared_ptr ProcessGroupGloo::allgather_coalesced( auto tag = nextTag(); auto context = getContext(tag); - auto work = std::make_shared( + auto work = c10::make_intrusive( std::move(context), output_lists, input_list, tag); enqueue(work); return work; } -std::shared_ptr ProcessGroupGloo::allgather_base( +c10::intrusive_ptr ProcessGroupGloo::allgather_base( at::Tensor& /*unused */, at::Tensor& /*unused */, const AllgatherOptions& /*unused */) { @@ -2057,7 +2058,7 @@ class AsyncGatherCUDAWork : public AsyncGatherWork { } // namespace -std::shared_ptr ProcessGroupGloo::gather( +c10::intrusive_ptr ProcessGroupGloo::gather( std::vector>& outputs, std::vector& inputs, const GatherOptions& opts) { @@ -2103,15 +2104,15 @@ std::shared_ptr ProcessGroupGloo::gather( invalidArgument(c10::str("unsupported device type ", device.type())); } - std::shared_ptr work; + c10::intrusive_ptr work; auto tag = nextTag(); auto context = getContext(tag); if (device.type() == at::kCPU) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), outputs, inputs, opts.rootRank, tag); #ifdef USE_CUDA } else if (device.type() == at::kCUDA) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), outputs, inputs, opts.rootRank, tag); #endif } else { @@ -2245,7 +2246,7 @@ class AsyncScatterCUDAWork : public AsyncScatterWork { } // namespace -std::shared_ptr ProcessGroupGloo::scatter( +c10::intrusive_ptr ProcessGroupGloo::scatter( std::vector& outputs, std::vector>& inputs, const ScatterOptions& opts) { @@ -2290,15 +2291,15 @@ std::shared_ptr ProcessGroupGloo::scatter( invalidArgument(c10::str("unsupported device type ", device.type())); } - std::shared_ptr work; + 
c10::intrusive_ptr work; auto tag = nextTag(); auto context = getContext(tag); if (device.type() == at::kCPU) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), outputs, inputs, opts.rootRank, tag); #ifdef USE_CUDA } else if (device.type() == at::kCUDA) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), outputs, inputs, opts.rootRank, tag); #endif } else { @@ -2308,7 +2309,7 @@ std::shared_ptr ProcessGroupGloo::scatter( return work; } -std::shared_ptr ProcessGroupGloo::reduce_scatter( +c10::intrusive_ptr ProcessGroupGloo::reduce_scatter( std::vector& outputs, std::vector>& inputs, const ReduceScatterOptions& opts) { @@ -2443,7 +2444,7 @@ class AsyncAlltoallCUDAWork : public AsyncAlltoallWork { } // namespace -std::shared_ptr ProcessGroupGloo::alltoall_base( +c10::intrusive_ptr ProcessGroupGloo::alltoall_base( at::Tensor& outputTensor, at::Tensor& inputTensor, std::vector& outputCounts, @@ -2460,12 +2461,12 @@ std::shared_ptr ProcessGroupGloo::alltoall_base( assertDense(invalidArgument, {inputTensor}); const auto& device = outputTensor.device(); - std::shared_ptr work; + c10::intrusive_ptr work; auto tag = nextTag(); auto context = getContext(tag); if (device.type() == at::kCPU) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), outputTensor, inputTensor, @@ -2474,7 +2475,7 @@ std::shared_ptr ProcessGroupGloo::alltoall_base( tag); #ifdef USE_CUDA } else if (device.type() == at::kCUDA) { - work = std::make_shared( + work = c10::make_intrusive( std::move(context), outputTensor, inputTensor, @@ -2510,7 +2511,7 @@ uint32_t checkTag(int32_t tag) { return (uint32_t)tag; } -std::shared_ptr ProcessGroupGloo::send( +c10::intrusive_ptr ProcessGroupGloo::send( std::vector& tensors, int dstRank, int tag) { @@ -2526,10 +2527,10 @@ std::shared_ptr ProcessGroupGloo::send( // The work captures the tensor to prevent it being deallocated and // the unbound buffer to synchronize on completion of the send. - return std::make_shared(tensor, std::move(buf)); + return c10::make_intrusive(tensor, std::move(buf)); } -std::shared_ptr ProcessGroupGloo::recv( +c10::intrusive_ptr ProcessGroupGloo::recv( std::vector& tensors, int srcRank, int tag) { @@ -2545,10 +2546,10 @@ std::shared_ptr ProcessGroupGloo::recv( // The work captures the tensor to prevent it being deallocated and // the unbound buffer to synchronize on completion of the recv. - return std::make_shared(tensor, std::move(buf)); + return c10::make_intrusive(tensor, std::move(buf)); } -std::shared_ptr ProcessGroupGloo::recvAnysource( +c10::intrusive_ptr ProcessGroupGloo::recvAnysource( std::vector& tensors, int tag) { auto& tensor = checkSingleTensor(tensors); @@ -2573,7 +2574,7 @@ std::shared_ptr ProcessGroupGloo::recvAnysource( // The work captures the tensor to prevent it being deallocated and // the unbound buffer to synchronize on completion of the recv. 
- return std::make_shared(tensor, std::move(buf)); + return c10::make_intrusive(tensor, std::move(buf)); } namespace { @@ -2582,13 +2583,13 @@ class AsyncBarrierWork : public ProcessGroupGloo::AsyncWork { public: AsyncBarrierWork( const std::shared_ptr& context, - std::vector> priorWork, + std::vector> priorWork, uint32_t tag) : ProcessGroupGloo::AsyncWork("gloo:barrier"), context(context), priorWork(std::move(priorWork)), tag(tag) {} std::shared_ptr context; - std::vector> priorWork; + std::vector> priorWork; const uint32_t tag; void run() override { @@ -2608,9 +2609,9 @@ class AsyncBarrierWork : public ProcessGroupGloo::AsyncWork { } // namespace -std::shared_ptr ProcessGroupGloo::barrier( +c10::intrusive_ptr ProcessGroupGloo::barrier( const BarrierOptions& opts) { - std::vector> priorWork; + std::vector> priorWork; // Snapshot all in progress and pending work as weak_ptr. // When executing a barrier, we need to ensure that all prior work @@ -2624,7 +2625,7 @@ std::shared_ptr ProcessGroupGloo::barrier( auto tag = nextTag(); auto context = getContext(tag); - auto work = std::make_shared( + auto work = c10::make_intrusive( std::move(context), std::move(priorWork), tag); enqueue(work); return work; diff --git a/torch/lib/c10d/ProcessGroupGloo.hpp b/torch/lib/c10d/ProcessGroupGloo.hpp index 31664ad0b6cf..0508b6f857a1 100644 --- a/torch/lib/c10d/ProcessGroupGloo.hpp +++ b/torch/lib/c10d/ProcessGroupGloo.hpp @@ -70,7 +70,7 @@ class ProcessGroupGloo : public ProcessGroup { public: AsyncWork(const char* profilingTitle = nullptr): ProcessGroup::Work(-1, OpType::UNKNOWN, profilingTitle) {} - static void execute(std::shared_ptr work) { + static void execute(c10::intrusive_ptr work) { std::exception_ptr eptr; try { work->run(); @@ -152,82 +152,82 @@ class ProcessGroupGloo : public ProcessGroup { static std::shared_ptr<::gloo::transport::Device> createDefaultDevice(); explicit ProcessGroupGloo( - const std::shared_ptr& store, + const c10::intrusive_ptr& store, int rank, int size, Options options = Options()); virtual ~ProcessGroupGloo(); - std::shared_ptr broadcast( + c10::intrusive_ptr broadcast( std::vector& tensors, const BroadcastOptions& opts = BroadcastOptions()) override; - std::shared_ptr allreduce( + c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; - std::shared_ptr allreduce_coalesced( + c10::intrusive_ptr allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts = AllreduceCoalescedOptions()) override; - std::shared_ptr reduce( + c10::intrusive_ptr reduce( std::vector& tensors, const ReduceOptions& opts = ReduceOptions()) override; - std::shared_ptr allgather( + c10::intrusive_ptr allgather( std::vector>& outputs, std::vector& inputs, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr allgather_base( + c10::intrusive_ptr allgather_base( at::Tensor& outputBuffer, at::Tensor& inputBuffer, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr allgather_coalesced( + c10::intrusive_ptr allgather_coalesced( std::vector>& output_lists, std::vector& input_list, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr gather( + c10::intrusive_ptr gather( std::vector>& outputs, std::vector& inputs, const GatherOptions& opts = GatherOptions()) override; - std::shared_ptr scatter( + c10::intrusive_ptr scatter( std::vector& outputs, std::vector>& inputs, const ScatterOptions& opts = ScatterOptions()) override; - std::shared_ptr reduce_scatter( 
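// Sketch (illustrative only, not applied by this patch): barrier() above snapshots the
// outstanding work as weak references so it can wait on prior operations without keeping
// them alive artificially. The element type of priorWork is not visible here (the angle
// brackets were lost in extraction); the natural counterpart to std::weak_ptr in this
// setting is c10::weak_intrusive_ptr, sketched below on a hypothetical PriorWork type.
//
#include <vector>
#include <c10/util/intrusive_ptr.h>

struct PriorWork : c10::intrusive_ptr_target {
  void wait() {}
};

void wait_for_prior(const std::vector<c10::intrusive_ptr<PriorWork>>& inFlight) {
  std::vector<c10::weak_intrusive_ptr<PriorWork>> prior;
  for (const auto& w : inFlight) {
    prior.emplace_back(w);  // weak reference: does not extend the work's lifetime
  }
  for (auto& weak : prior) {
    auto strong = weak.lock();  // empty if the work already completed and was freed
    if (strong.get() != nullptr) {
      strong->wait();
    }
  }
}
// end of sketch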
+ c10::intrusive_ptr reduce_scatter( std::vector& outputs, std::vector>& inputs, const ReduceScatterOptions& opts = ReduceScatterOptions()) override; - std::shared_ptr alltoall_base( + c10::intrusive_ptr alltoall_base( at::Tensor& outputTensor, at::Tensor& inputTensor, std::vector& outputCounts, std::vector& inputCounts, const AllToAllOptions& opts = AllToAllOptions()) override; - std::shared_ptr send( + c10::intrusive_ptr send( std::vector& tensors, int dstRank, int tag) override; - std::shared_ptr recv( + c10::intrusive_ptr recv( std::vector& tensors, int srcRank, int tag) override; - std::shared_ptr recvAnysource( + c10::intrusive_ptr recvAnysource( std::vector& tensors, int tag) override; - std::shared_ptr barrier( + c10::intrusive_ptr barrier( const BarrierOptions& opts = BarrierOptions()) override; protected: @@ -258,7 +258,7 @@ class ProcessGroupGloo : public ProcessGroup { void runLoop(int workerIndex); // Queue work to run on worker thread. - void enqueue(std::shared_ptr work); + void enqueue(c10::intrusive_ptr work); // Keep both a queue of pending work, and a vector with in progress work. // Both of these can only be mutated when holding the queue lock. @@ -266,8 +266,8 @@ class ProcessGroupGloo : public ProcessGroup { // to all in progress and pending work when executing a barrier. // When executing a barrier, we need to ensure that all prior work // has completed before completing itself. - std::deque> workQueue_; - std::vector> workInProgress_; + std::deque> workQueue_; + std::vector> workInProgress_; std::mutex workMutex_; std::condition_variable workProduceCV_; std::condition_variable workConsumeCV_; diff --git a/torch/lib/c10d/ProcessGroupMPI.cpp b/torch/lib/c10d/ProcessGroupMPI.cpp index d3e79a1dd424..5f9d0be41b8f 100644 --- a/torch/lib/c10d/ProcessGroupMPI.cpp +++ b/torch/lib/c10d/ProcessGroupMPI.cpp @@ -308,9 +308,9 @@ void ProcessGroupMPI::runLoop() { } } -std::shared_ptr ProcessGroupMPI::enqueue( +c10::intrusive_ptr ProcessGroupMPI::enqueue( std::unique_ptr entry) { - auto work = std::make_shared(); + auto work = c10::make_intrusive(); std::unique_lock lock(pgMutex_); queue_.push_back(std::make_tuple(std::move(entry), work)); lock.unlock(); @@ -318,7 +318,7 @@ std::shared_ptr ProcessGroupMPI::enqueue( return work; } -std::shared_ptr ProcessGroupMPI::broadcast( +c10::intrusive_ptr ProcessGroupMPI::broadcast( std::vector& tensors, const BroadcastOptions& opts) { checkSingleTensor(tensors); @@ -339,7 +339,7 @@ std::shared_ptr ProcessGroupMPI::broadcast( return enqueue(std::move(entry)); } -std::shared_ptr ProcessGroupMPI::allreduce( +c10::intrusive_ptr ProcessGroupMPI::allreduce( std::vector& tensors, const AllreduceOptions& opts) { checkSingleTensor(tensors); @@ -362,14 +362,14 @@ std::shared_ptr ProcessGroupMPI::allreduce( return enqueue(std::move(entry)); } -std::shared_ptr ProcessGroupMPI::allreduce_coalesced( +c10::intrusive_ptr ProcessGroupMPI::allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts) { throw std::runtime_error( "allreduce_coalesced is currently not supported with MPI"); } -std::shared_ptr ProcessGroupMPI::reduce( +c10::intrusive_ptr ProcessGroupMPI::reduce( std::vector& tensors, const ReduceOptions& opts) { checkSingleTensor(tensors); @@ -397,7 +397,7 @@ std::shared_ptr ProcessGroupMPI::reduce( return enqueue(std::move(entry)); } -std::shared_ptr ProcessGroupMPI::allgather( +c10::intrusive_ptr ProcessGroupMPI::allgather( std::vector>& outputTensors, std::vector& inputTensors, const AllgatherOptions& opts) { @@ -441,7 +441,7 
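// Sketch (illustrative only, not applied by this patch): any out-of-tree backend that
// derives from c10d::ProcessGroup has to mirror the new return types, otherwise its
// `override` declarations no longer match the base class and fail to compile.
// Declaration-level sketch only (the class stays abstract; the remaining collectives
// and the constructor arguments are illustrative):
//
#include <vector>
#include <ATen/ATen.h>
#include <c10d/ProcessGroup.hpp>

class MyBackend : public c10d::ProcessGroup {
 public:
  MyBackend(int rank, int size) : ProcessGroup(rank, size) {}

  // Previously: std::shared_ptr<ProcessGroup::Work> broadcast(...)
  c10::intrusive_ptr<c10d::ProcessGroup::Work> broadcast(
      std::vector<at::Tensor>& tensors,
      const c10d::BroadcastOptions& opts = c10d::BroadcastOptions()) override;

  // ...allreduce, reduce, allgather, send, recv, barrier, etc. need the same update.
};
// end of sketch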
@@ std::shared_ptr ProcessGroupMPI::allgather( return enqueue(std::move(entry)); } -std::shared_ptr ProcessGroupMPI::allgather_coalesced( +c10::intrusive_ptr ProcessGroupMPI::allgather_coalesced( std::vector>& /* unused */, std::vector& /* unused */, const AllgatherOptions& /* unused */) { @@ -449,7 +449,7 @@ std::shared_ptr ProcessGroupMPI::allgather_coalesced( "ProcessGroupMPI does not support allgather_coalesced"); } -std::shared_ptr ProcessGroupMPI::gather( +c10::intrusive_ptr ProcessGroupMPI::gather( std::vector>& outputTensors, std::vector& inputTensors, const GatherOptions& opts) { @@ -516,7 +516,7 @@ std::shared_ptr ProcessGroupMPI::gather( } } -std::shared_ptr ProcessGroupMPI::scatter( +c10::intrusive_ptr ProcessGroupMPI::scatter( std::vector& outputTensors, std::vector>& inputTensors, const ScatterOptions& opts) { @@ -582,14 +582,14 @@ std::shared_ptr ProcessGroupMPI::scatter( } } -std::shared_ptr ProcessGroupMPI::reduce_scatter( +c10::intrusive_ptr ProcessGroupMPI::reduce_scatter( std::vector& outputTensors, std::vector>& inputTensors, const ReduceScatterOptions& opts) { throw std::runtime_error("ProcessGroupMPI does not support reduce_scatter"); } -std::shared_ptr ProcessGroupMPI::alltoall_base( +c10::intrusive_ptr ProcessGroupMPI::alltoall_base( at::Tensor& outputTensor, at::Tensor& inputTensor, std::vector& outputSplitSizes, @@ -665,7 +665,7 @@ std::shared_ptr ProcessGroupMPI::alltoall_base( return enqueue(std::move(entry)); } } -std::shared_ptr ProcessGroupMPI::alltoall( +c10::intrusive_ptr ProcessGroupMPI::alltoall( std::vector& outputTensors, std::vector& inputTensors, const AllToAllOptions& opts) { @@ -722,7 +722,7 @@ std::shared_ptr ProcessGroupMPI::alltoall( return enqueue(std::move(entry)); } -std::shared_ptr ProcessGroupMPI::send( +c10::intrusive_ptr ProcessGroupMPI::send( std::vector& tensors, int dstRank, int tag) { @@ -744,10 +744,10 @@ std::shared_ptr ProcessGroupMPI::send( &request)); } - return std::make_shared(tensor, request); + return c10::make_intrusive(tensor, request); } -std::shared_ptr ProcessGroupMPI::recv( +c10::intrusive_ptr ProcessGroupMPI::recv( std::vector& tensors, int srcRank, int tag) { @@ -769,10 +769,10 @@ std::shared_ptr ProcessGroupMPI::recv( &request)); } - return std::make_shared(tensor, request); + return c10::make_intrusive(tensor, request); } -std::shared_ptr ProcessGroupMPI::recvAnysource( +c10::intrusive_ptr ProcessGroupMPI::recvAnysource( std::vector& tensors, int tag) { checkSingleTensor(tensors); @@ -793,10 +793,10 @@ std::shared_ptr ProcessGroupMPI::recvAnysource( &request)); } - return std::make_shared(tensor, request); + return c10::make_intrusive(tensor, request); } -std::shared_ptr ProcessGroupMPI::barrier( +c10::intrusive_ptr ProcessGroupMPI::barrier( const BarrierOptions& opts) { std::function&)> runFunc = [this](std::unique_ptr& entry) { @@ -808,7 +808,7 @@ std::shared_ptr ProcessGroupMPI::barrier( return enqueue(std::move(entry)); } -std::shared_ptr ProcessGroupMPI::allgather_base( +c10::intrusive_ptr ProcessGroupMPI::allgather_base( at::Tensor& /*unused */, at::Tensor& /*unused */, const AllgatherOptions& /*unused */) { diff --git a/torch/lib/c10d/ProcessGroupMPI.hpp b/torch/lib/c10d/ProcessGroupMPI.hpp index 342fe87001a0..48d95eada887 100644 --- a/torch/lib/c10d/ProcessGroupMPI.hpp +++ b/torch/lib/c10d/ProcessGroupMPI.hpp @@ -108,80 +108,80 @@ class ProcessGroupMPI : public ProcessGroup { // Abort the MPI program, needs to be called when exception is detected void abort(); - std::shared_ptr broadcast( + 
c10::intrusive_ptr broadcast( std::vector& data, const BroadcastOptions& opts = BroadcastOptions()) override; - std::shared_ptr allreduce( + c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; - std::shared_ptr allreduce_coalesced( + c10::intrusive_ptr allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts = AllreduceCoalescedOptions()) override; - std::shared_ptr reduce( + c10::intrusive_ptr reduce( std::vector& tensors, const ReduceOptions& opts = ReduceOptions()) override; - std::shared_ptr allgather( + c10::intrusive_ptr allgather( std::vector>& outputTensors, std::vector& inputTensors, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr allgather_base( + c10::intrusive_ptr allgather_base( at::Tensor& outputbuffer, at::Tensor& inputbuffer, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr allgather_coalesced( + c10::intrusive_ptr allgather_coalesced( std::vector>& outputTensorLists, std::vector& inputTensors, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr gather( + c10::intrusive_ptr gather( std::vector>& outputTensors, std::vector& inputTensors, const GatherOptions& opts = GatherOptions()) override; - std::shared_ptr scatter( + c10::intrusive_ptr scatter( std::vector& outputTensors, std::vector>& inputTensors, const ScatterOptions& opts = ScatterOptions()) override; - std::shared_ptr reduce_scatter( + c10::intrusive_ptr reduce_scatter( std::vector& outputTensors, std::vector>& inputTensors, const ReduceScatterOptions& opts = ReduceScatterOptions()) override; - std::shared_ptr alltoall_base( + c10::intrusive_ptr alltoall_base( at::Tensor& outputTensor, at::Tensor& inputTensor, std::vector& outputSplitSizes, std::vector& inputSplitSizes, const AllToAllOptions& opts = AllToAllOptions()) override; - std::shared_ptr alltoall( + c10::intrusive_ptr alltoall( std::vector& outputTensors, std::vector& inputTensors, const AllToAllOptions& opts = AllToAllOptions()) override; - std::shared_ptr send( + c10::intrusive_ptr send( std::vector& tensors, int dstRank, int tag); - std::shared_ptr recv( + c10::intrusive_ptr recv( std::vector& tensors, int srcRank, int tag); - std::shared_ptr recvAnysource( + c10::intrusive_ptr recvAnysource( std::vector& tensor, int tag); - std::shared_ptr barrier( + c10::intrusive_ptr barrier( const BarrierOptions& opts = BarrierOptions()) override; // Creating a new ProcessGroupMPI, will initiialize MPI if not initialized @@ -190,13 +190,13 @@ class ProcessGroupMPI : public ProcessGroup { protected: using WorkType = - std::tuple, std::shared_ptr>; + std::tuple, c10::intrusive_ptr>; // Worker thread loop void runLoop(); // Helper function that is called by the destructor void destroy(); - std::shared_ptr enqueue(std::unique_ptr entry); + c10::intrusive_ptr enqueue(std::unique_ptr entry); bool stop_; diff --git a/torch/lib/c10d/ProcessGroupNCCL.cpp b/torch/lib/c10d/ProcessGroupNCCL.cpp index ba0b4b36c77d..acb81d0cad6d 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.cpp +++ b/torch/lib/c10d/ProcessGroupNCCL.cpp @@ -437,7 +437,7 @@ bool ProcessGroupNCCL::WorkNCCL::timedOut() { } ProcessGroupNCCL::ProcessGroupNCCL( - const std::shared_ptr& store, + const c10::intrusive_ptr& store, int rank, int size, Options options) @@ -984,12 +984,12 @@ std::vector flatten_for_scatter_gather( } // namespace -std::shared_ptr ProcessGroupNCCL::initWork( +c10::intrusive_ptr ProcessGroupNCCL::initWork( std::vector devices, int 
rank, OpType opType, const char* profilingTitle) { - return std::make_shared(devices, rank, opType, profilingTitle); + return c10::make_intrusive(devices, rank, opType); } std::vector ProcessGroupNCCL::WorkNCCL::result() { @@ -1012,7 +1012,7 @@ c10::intrusive_ptr ProcessGroupNCCL::WorkNCCL:: } void ProcessGroupNCCL::workEnqueue( - std::shared_ptr work) { + c10::intrusive_ptr work) { if (!terminateProcessGroup_.load()) { std::lock_guard lock(workMetaListMutex_); // Avoid view tensors to be processed in cleanup thread. @@ -1027,7 +1027,7 @@ ProcessGroupNCCL::Options::Options() isHighPriorityStream(false) {} template -std::shared_ptr ProcessGroupNCCL::collective( +c10::intrusive_ptr ProcessGroupNCCL::collective( std::vector& inputs, std::vector& outputs, Fn fn, @@ -1114,7 +1114,7 @@ std::shared_ptr ProcessGroupNCCL::collective( } template -std::shared_ptr ProcessGroupNCCL::pointToPoint( +c10::intrusive_ptr ProcessGroupNCCL::pointToPoint( std::vector& tensors, Fn fn, int peer, @@ -1186,7 +1186,7 @@ std::shared_ptr ProcessGroupNCCL::pointToPoint( } template -std::shared_ptr ProcessGroupNCCL::collective( +c10::intrusive_ptr ProcessGroupNCCL::collective( std::vector& inputs, std::vector& outputs, Fn fn, @@ -1203,7 +1203,7 @@ std::shared_ptr ProcessGroupNCCL::collective( } template -std::shared_ptr ProcessGroupNCCL::pointToPoint( +c10::intrusive_ptr ProcessGroupNCCL::pointToPoint( std::vector& tensor, Fn fn, int peer, @@ -1217,7 +1217,7 @@ std::shared_ptr ProcessGroupNCCL::pointToPoint( [](std::vector&) {}); } -std::shared_ptr ProcessGroupNCCL::allreduce( +c10::intrusive_ptr ProcessGroupNCCL::allreduce( std::vector& tensors, const AllreduceOptions& opts) { check_gpu_tensors(tensors); @@ -1242,14 +1242,14 @@ std::shared_ptr ProcessGroupNCCL::allreduce( "nccl:all_reduce"); } -std::shared_ptr ProcessGroupNCCL::allreduce_coalesced( +c10::intrusive_ptr ProcessGroupNCCL::allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts) { throw std::runtime_error( "allreduce_coalesced is currently not supported with NCCL"); } -std::shared_ptr ProcessGroupNCCL::broadcast( +c10::intrusive_ptr ProcessGroupNCCL::broadcast( std::vector& tensors, const BroadcastOptions& opts) { check_gpu_tensors(tensors); @@ -1274,7 +1274,7 @@ std::shared_ptr ProcessGroupNCCL::broadcast( "nccl:broadcast"); } -std::shared_ptr ProcessGroupNCCL::reduce( +c10::intrusive_ptr ProcessGroupNCCL::reduce( std::vector& tensors, const ReduceOptions& opts) { check_gpu_tensors(tensors); @@ -1301,7 +1301,7 @@ std::shared_ptr ProcessGroupNCCL::reduce( "nccl:reduce"); } -std::shared_ptr ProcessGroupNCCL::allgather( +c10::intrusive_ptr ProcessGroupNCCL::allgather( std::vector>& outputTensors, std::vector& inputTensors, const AllgatherOptions& opts) { @@ -1346,7 +1346,7 @@ std::shared_ptr ProcessGroupNCCL::allgather( "nccl:all_gather"); } -std::shared_ptr ProcessGroupNCCL::allgather_coalesced( +c10::intrusive_ptr ProcessGroupNCCL::allgather_coalesced( std::vector>& /* unused */, std::vector& /* unused */, const AllgatherOptions& /* unused */) { @@ -1354,7 +1354,7 @@ std::shared_ptr ProcessGroupNCCL::allgather_coalesced( "ProcessGroupNCCL does not support allgather_coalesced"); } -std::shared_ptr ProcessGroupNCCL::reduce_scatter( +c10::intrusive_ptr ProcessGroupNCCL::reduce_scatter( std::vector& outputTensors, std::vector>& inputTensors, const ReduceScatterOptions& opts) { @@ -1400,7 +1400,7 @@ std::shared_ptr ProcessGroupNCCL::reduce_scatter( "nccl:reduce_scatter"); } -std::shared_ptr ProcessGroupNCCL::barrier( 
+c10::intrusive_ptr ProcessGroupNCCL::barrier( const BarrierOptions& opts) { std::vector devices; if (usedDeviceIdxs_.empty()) { @@ -1441,7 +1441,7 @@ std::shared_ptr ProcessGroupNCCL::barrier( } #ifdef ENABLE_NCCL_P2P_SUPPORT -std::shared_ptr ProcessGroupNCCL::alltoall_base( +c10::intrusive_ptr ProcessGroupNCCL::alltoall_base( at::Tensor& outputTensor, at::Tensor& inputTensor, std::vector& outputSplitSizes, @@ -1512,7 +1512,7 @@ std::shared_ptr ProcessGroupNCCL::alltoall_base( } } -std::shared_ptr ProcessGroupNCCL::send( +c10::intrusive_ptr ProcessGroupNCCL::send( std::vector& tensors, int dstRank, int /* unused */) { @@ -1531,7 +1531,7 @@ std::shared_ptr ProcessGroupNCCL::send( return ret; } -std::shared_ptr ProcessGroupNCCL::recv( +c10::intrusive_ptr ProcessGroupNCCL::recv( std::vector& tensors, int srcRank, int /* unused */) { @@ -1550,7 +1550,7 @@ std::shared_ptr ProcessGroupNCCL::recv( return ret; } #else -std::shared_ptr ProcessGroupNCCL::alltoall_base( +c10::intrusive_ptr ProcessGroupNCCL::alltoall_base( at::Tensor& /* unused */, at::Tensor& /* unused */, std::vector& /* unused */, @@ -1560,7 +1560,7 @@ std::shared_ptr ProcessGroupNCCL::alltoall_base( "ProcessGroupNCCL only supports alltoall* for NCCL lib version >= 2.7.0"); } -std::shared_ptr ProcessGroupNCCL::send( +c10::intrusive_ptr ProcessGroupNCCL::send( std::vector& /* unused */, int /* unused */, int /* unused */) { @@ -1568,7 +1568,7 @@ std::shared_ptr ProcessGroupNCCL::send( "ProcessGroupNCCL only supports send for NCCL lib version >= 2.7.0"); } -std::shared_ptr ProcessGroupNCCL::recv( +c10::intrusive_ptr ProcessGroupNCCL::recv( std::vector& /* unused */, int /* unused */, int /* unused */) { @@ -1591,34 +1591,34 @@ void ProcessGroupNCCL::groupEnd() { --ncclActiveGroupCounter_; } -std::shared_ptr ProcessGroupNCCL::alltoall( +c10::intrusive_ptr ProcessGroupNCCL::alltoall( std::vector& /* unused */, std::vector& /* unused */, const AllToAllOptions& /* unused */) { throw std::runtime_error("ProcessGroupNCCL does not support alltoall"); } -std::shared_ptr ProcessGroupNCCL::gather( +c10::intrusive_ptr ProcessGroupNCCL::gather( std::vector>& /* unused */, std::vector& /* unused */, const GatherOptions& /* unused */) { throw std::runtime_error("ProcessGroupNCCL does not support gather"); } -std::shared_ptr ProcessGroupNCCL::scatter( +c10::intrusive_ptr ProcessGroupNCCL::scatter( std::vector& /* unused */, std::vector>& /* unused */, const ScatterOptions& /* unused */) { throw std::runtime_error("ProcessGroupNCCL does not support scatter"); } -std::shared_ptr ProcessGroupNCCL::recvAnysource( +c10::intrusive_ptr ProcessGroupNCCL::recvAnysource( std::vector& /* unused */, int /* unused */) { throw std::runtime_error("ProcessGroupNCCL does not support recvAnysource"); } -std::shared_ptr ProcessGroupNCCL::allgather_base( +c10::intrusive_ptr ProcessGroupNCCL::allgather_base( at::Tensor& /*unused */, at::Tensor& /*unused */, const AllgatherOptions& /*unused */) { diff --git a/torch/lib/c10d/ProcessGroupNCCL.hpp b/torch/lib/c10d/ProcessGroupNCCL.hpp index 1520604629f2..b93bd0c2d70c 100644 --- a/torch/lib/c10d/ProcessGroupNCCL.hpp +++ b/torch/lib/c10d/ProcessGroupNCCL.hpp @@ -65,7 +65,7 @@ constexpr const char* NCCL_ASYNC_ERROR_HANDLING = "NCCL_ASYNC_ERROR_HANDLING"; class ProcessGroupNCCL : public ProcessGroup { public: class WorkNCCL : public ProcessGroup::Work, - public std::enable_shared_from_this { + public std::enable_shared_from_this { public: // Constructor takes a list of CUDA devices WorkNCCL(const std::vector& devices, int 
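// Sketch (illustrative only, not applied by this patch): most call sites are insulated
// from the pointer-type change because they hold the handle in an `auto` variable and
// only use operator->, which std::shared_ptr and c10::intrusive_ptr both provide.
//
#include <exception>
#include <vector>
#include <ATen/ATen.h>
#include <c10d/ProcessGroup.hpp>

void call_site_demo(c10d::ProcessGroup& pg, std::vector<at::Tensor>& tensors) {
  auto work = pg.allreduce(tensors);  // now deduced as c10::intrusive_ptr<ProcessGroup::Work>
  work->wait();                       // member access is unchanged
  if (!work->isSuccess()) {
    std::rethrow_exception(work->exception());
  }
}
// end of sketch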
rank, OpType opType, const char* profilingTitle = nullptr); @@ -163,7 +163,7 @@ class ProcessGroupNCCL : public ProcessGroup { // Reference to the store so that we can write aborted communicators // to the store. - std::shared_ptr store_; + c10::intrusive_ptr store_; // Store a reference to NCCL collective's outputs to be used by getFuture. std::shared_ptr> outputs_; @@ -393,7 +393,7 @@ class ProcessGroupNCCL : public ProcessGroup { // communicator. These NCCL communicators are cached and reused if possible. // ProcessGroupNCCL( - const std::shared_ptr& store, + const c10::intrusive_ptr& store, int rank, int size, Options options = Options()); @@ -402,7 +402,7 @@ class ProcessGroupNCCL : public ProcessGroup { // If you have existing code that uses the `groupName`, you can replace // it by specifying a `c10d::PrefixStore(groupName, store)` for store. C10_DEPRECATED ProcessGroupNCCL( - const std::shared_ptr& store, + const c10::intrusive_ptr& store, int rank, int size, const std::string& groupName, @@ -411,64 +411,64 @@ class ProcessGroupNCCL : public ProcessGroup { virtual ~ProcessGroupNCCL(); - std::shared_ptr broadcast( + c10::intrusive_ptr broadcast( std::vector& tensors, const BroadcastOptions& opts = BroadcastOptions()) override; - std::shared_ptr allreduce( + c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; - std::shared_ptr allreduce_coalesced( + c10::intrusive_ptr allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts = AllreduceCoalescedOptions()) override; - std::shared_ptr reduce( + c10::intrusive_ptr reduce( std::vector& tensors, const ReduceOptions& opts = ReduceOptions()) override; - std::shared_ptr allgather( + c10::intrusive_ptr allgather( std::vector>& outputTensors, std::vector& inputTensors, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr allgather_base( + c10::intrusive_ptr allgather_base( at::Tensor& outputbuffer, at::Tensor& inputbuffer, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr allgather_coalesced( + c10::intrusive_ptr allgather_coalesced( std::vector>& outputTensorLists, std::vector& inputTensors, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr reduce_scatter( + c10::intrusive_ptr reduce_scatter( std::vector& outputTensors, std::vector>& inputTensors, const ReduceScatterOptions& opts = ReduceScatterOptions()) override; - std::shared_ptr barrier( + c10::intrusive_ptr barrier( const BarrierOptions& opts = BarrierOptions()) override; - std::shared_ptr alltoall_base( + c10::intrusive_ptr alltoall_base( at::Tensor& outputTensor, at::Tensor& inputTensor, std::vector& outputSplitSizes, std::vector& inputSplitSizes, const AllToAllOptions& opts = AllToAllOptions()) override; - std::shared_ptr alltoall( + c10::intrusive_ptr alltoall( std::vector& outputTensors, std::vector& inputTensors, const AllToAllOptions& opts = AllToAllOptions()) override; - std::shared_ptr send( + c10::intrusive_ptr send( std::vector& tensors, int dstRank, int tag) override; - std::shared_ptr recv( + c10::intrusive_ptr recv( std::vector& tensors, int srcRank, int tag) override; @@ -478,17 +478,17 @@ class ProcessGroupNCCL : public ProcessGroup { static void groupEnd(); // Unsupported Ops - std::shared_ptr gather( + c10::intrusive_ptr gather( std::vector>& outputTensors, std::vector& inputTensors, const GatherOptions& opts = GatherOptions()) override; - std::shared_ptr scatter( + c10::intrusive_ptr scatter( 
std::vector& outputTensors, std::vector>& inputTensors, const ScatterOptions& opts = ScatterOptions()) override; - std::shared_ptr recvAnysource( + c10::intrusive_ptr recvAnysource( std::vector& tensors, int tag) override; @@ -515,7 +515,7 @@ class ProcessGroupNCCL : public ProcessGroup { virtual std::exception_ptr checkForNCCLErrors( const std::vector>& ncclComms); - virtual std::shared_ptr initWork( + virtual c10::intrusive_ptr initWork( std::vector devices, int rank, OpType opType, @@ -529,14 +529,14 @@ class ProcessGroupNCCL : public ProcessGroup { // ncclComm_t, at::cuda::CUDAStream&); // void {pre,post}(std::vector); template - std::shared_ptr collective( + c10::intrusive_ptr collective( std::vector& input, std::vector& output, Fn fn, OpType opType, const char* profilingTitle = nullptr); template - std::shared_ptr collective( + c10::intrusive_ptr collective( std::vector& input, std::vector& output, Fn fn, @@ -549,13 +549,13 @@ class ProcessGroupNCCL : public ProcessGroup { // primitives. It is the same structure as the helper used for collective // communicaiton primitives. template - std::shared_ptr pointToPoint( + c10::intrusive_ptr pointToPoint( std::vector& tensor, Fn fn, int peer, OpType opType); template - std::shared_ptr pointToPoint( + c10::intrusive_ptr pointToPoint( std::vector& tensor, Fn fn, int peer, @@ -594,7 +594,7 @@ class ProcessGroupNCCL : public ProcessGroup { static const int64_t kWorkCleanupThreadSleepMillis; // The store is used to broadcast the NCCL unique ID of rank 0. - std::shared_ptr store_; + c10::intrusive_ptr store_; // The number of NCCL communicators that have been created during // the lifetime of this process group. This sequence number is @@ -664,7 +664,7 @@ class ProcessGroupNCCL : public ProcessGroup { std::list workMetaList_; // Add Work Pointer to workVector - void workEnqueue(std::shared_ptr); + void workEnqueue(c10::intrusive_ptr); // The CUDA steams used by NCCL kernels std::unordered_map> diff --git a/torch/lib/c10d/ProcessGroupRoundRobin.cpp b/torch/lib/c10d/ProcessGroupRoundRobin.cpp index 032f63c320f5..c77188577a62 100644 --- a/torch/lib/c10d/ProcessGroupRoundRobin.cpp +++ b/torch/lib/c10d/ProcessGroupRoundRobin.cpp @@ -17,66 +17,66 @@ ProcessGroupRoundRobin::ProcessGroupRoundRobin( ProcessGroupRoundRobin::~ProcessGroupRoundRobin() {} -std::shared_ptr ProcessGroupRoundRobin::broadcast( +c10::intrusive_ptr ProcessGroupRoundRobin::broadcast( std::vector& tensors, const BroadcastOptions& opts) { return next()->broadcast(tensors, opts); } -std::shared_ptr ProcessGroupRoundRobin::allreduce( +c10::intrusive_ptr ProcessGroupRoundRobin::allreduce( std::vector& tensors, const AllreduceOptions& opts) { return next()->allreduce(tensors, opts); } -std::shared_ptr ProcessGroupRoundRobin::allreduce_coalesced( +c10::intrusive_ptr ProcessGroupRoundRobin::allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts) { return next()->allreduce_coalesced(tensors, opts); } -std::shared_ptr ProcessGroupRoundRobin::reduce( +c10::intrusive_ptr ProcessGroupRoundRobin::reduce( std::vector& tensors, const ReduceOptions& opts) { return next()->reduce(tensors, opts); } -std::shared_ptr ProcessGroupRoundRobin::allgather( +c10::intrusive_ptr ProcessGroupRoundRobin::allgather( std::vector>& outputs, std::vector& inputs, const AllgatherOptions& opts) { return next()->allgather(outputs, inputs, opts); }; -std::shared_ptr ProcessGroupRoundRobin::allgather_coalesced( +c10::intrusive_ptr ProcessGroupRoundRobin::allgather_coalesced( std::vector>& 
outputTensorLists, std::vector& inputTensors, const AllgatherOptions& opts) { return next()->allgather(outputTensorLists, inputTensors, opts); } -std::shared_ptr ProcessGroupRoundRobin::gather( +c10::intrusive_ptr ProcessGroupRoundRobin::gather( std::vector>& outputs, std::vector& inputs, const GatherOptions& opts) { return next()->gather(outputs, inputs, opts); }; -std::shared_ptr ProcessGroupRoundRobin::scatter( +c10::intrusive_ptr ProcessGroupRoundRobin::scatter( std::vector& outputs, std::vector>& inputs, const ScatterOptions& opts) { return next()->scatter(outputs, inputs, opts); }; -std::shared_ptr ProcessGroupRoundRobin::reduce_scatter( +c10::intrusive_ptr ProcessGroupRoundRobin::reduce_scatter( std::vector& outputs, std::vector>& inputs, const ReduceScatterOptions& opts) { return next()->reduce_scatter(outputs, inputs, opts); }; -std::shared_ptr ProcessGroupRoundRobin::alltoall_base( +c10::intrusive_ptr ProcessGroupRoundRobin::alltoall_base( at::Tensor& outputTensor, at::Tensor& inputTensor, std::vector& outputSplitSizes, @@ -86,27 +86,27 @@ std::shared_ptr ProcessGroupRoundRobin::alltoall_base( outputTensor, inputTensor, outputSplitSizes, inputSplitSizes, opts); }; -std::shared_ptr ProcessGroupRoundRobin::send( +c10::intrusive_ptr ProcessGroupRoundRobin::send( std::vector& /* unused */, int /* unused */, int /* unused */) { throw std::runtime_error("ProcessGroupRoundRobin does not support send"); }; -std::shared_ptr ProcessGroupRoundRobin::recv( +c10::intrusive_ptr ProcessGroupRoundRobin::recv( std::vector& /* unused */, int /* unused */, int /* unused */) { throw std::runtime_error("ProcessGroupRoundRobin does not support recv"); }; -std::shared_ptr ProcessGroupRoundRobin::recvAnysource( +c10::intrusive_ptr ProcessGroupRoundRobin::recvAnysource( std::vector& /* unused */, int /* unused */) { throw std::runtime_error("ProcessGroupRoundRobin does not support recv"); }; -std::shared_ptr ProcessGroupRoundRobin::barrier( +c10::intrusive_ptr ProcessGroupRoundRobin::barrier( const BarrierOptions& /* unused */) { throw std::runtime_error("ProcessGroupRoundRobin does not support barrier"); }; @@ -120,7 +120,7 @@ const std::shared_ptr& ProcessGroupRoundRobin::next() { return processGroup; } -std::shared_ptr ProcessGroupRoundRobin::allgather_base( +c10::intrusive_ptr ProcessGroupRoundRobin::allgather_base( at::Tensor& /*unused */, at::Tensor& /*unused */, const AllgatherOptions& /*unused */) { diff --git a/torch/lib/c10d/ProcessGroupRoundRobin.hpp b/torch/lib/c10d/ProcessGroupRoundRobin.hpp index bbbd0a1c756b..62d59ef18ce5 100644 --- a/torch/lib/c10d/ProcessGroupRoundRobin.hpp +++ b/torch/lib/c10d/ProcessGroupRoundRobin.hpp @@ -25,75 +25,75 @@ class ProcessGroupRoundRobin final : public ProcessGroup { ~ProcessGroupRoundRobin() override; - std::shared_ptr broadcast( + c10::intrusive_ptr broadcast( std::vector& tensors, const BroadcastOptions& opts = BroadcastOptions()) override; - std::shared_ptr allreduce( + c10::intrusive_ptr allreduce( std::vector& tensors, const AllreduceOptions& opts = AllreduceOptions()) override; - std::shared_ptr allreduce_coalesced( + c10::intrusive_ptr allreduce_coalesced( std::vector& tensors, const AllreduceCoalescedOptions& opts = AllreduceCoalescedOptions()) override; - std::shared_ptr reduce( + c10::intrusive_ptr reduce( std::vector& tensors, const ReduceOptions& opts = ReduceOptions()) override; - std::shared_ptr allgather( + c10::intrusive_ptr allgather( std::vector>& outputs, std::vector& inputs, const AllgatherOptions& opts = AllgatherOptions()) override; 
- std::shared_ptr allgather_base( + c10::intrusive_ptr allgather_base( at::Tensor& outputBuffer, at::Tensor& inputBuffer, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr allgather_coalesced( + c10::intrusive_ptr allgather_coalesced( std::vector>& outputTensorLists, std::vector& inputTensors, const AllgatherOptions& opts = AllgatherOptions()) override; - std::shared_ptr gather( + c10::intrusive_ptr gather( std::vector>& outputs, std::vector& inputs, const GatherOptions& opts = GatherOptions()) override; - std::shared_ptr scatter( + c10::intrusive_ptr scatter( std::vector& outputs, std::vector>& inputs, const ScatterOptions& opts = ScatterOptions()) override; - std::shared_ptr reduce_scatter( + c10::intrusive_ptr reduce_scatter( std::vector& outputs, std::vector>& inputs, const ReduceScatterOptions& opts = ReduceScatterOptions()) override; - std::shared_ptr alltoall_base( + c10::intrusive_ptr alltoall_base( at::Tensor& outputTensor, at::Tensor& inputTensor, std::vector& outputSplitSizes, std::vector& inputSplitSizes, const AllToAllOptions& opts = AllToAllOptions()) override; - std::shared_ptr send( + c10::intrusive_ptr send( std::vector& tensors, int dstRank, int tag) override; - std::shared_ptr recv( + c10::intrusive_ptr recv( std::vector& tensors, int srcRank, int tag) override; - std::shared_ptr recvAnysource( + c10::intrusive_ptr recvAnysource( std::vector& tensors, int tag) override; - std::shared_ptr barrier( + c10::intrusive_ptr barrier( const BarrierOptions& opts = BarrierOptions()) override; private: diff --git a/torch/lib/c10d/Store.hpp b/torch/lib/c10d/Store.hpp index e42bbf300e0b..f97e80013cdb 100644 --- a/torch/lib/c10d/Store.hpp +++ b/torch/lib/c10d/Store.hpp @@ -6,9 +6,11 @@ #include #include +#include + namespace c10d { -class Store { +class Store : public torch::CustomClassHolder { public: static constexpr std::chrono::milliseconds kDefaultTimeout = std::chrono::seconds(300); diff --git a/torch/lib/c10d/comm.cpp b/torch/lib/c10d/comm.cpp index a8628e0c942e..5ef88f058aca 100644 --- a/torch/lib/c10d/comm.cpp +++ b/torch/lib/c10d/comm.cpp @@ -45,8 +45,10 @@ class BroadcastWork { // because c10d::ProcessGroup::broadcast takes a vector argument. std::vector flat_tensor_; + private: + // The broadcast work that is kicked off upon construction. - std::shared_ptr work_; + c10::intrusive_ptr work_; }; } // namespace diff --git a/torch/lib/c10d/example/allreduce.cpp b/torch/lib/c10d/example/allreduce.cpp index 76d6a5588f7e..3de7447d092a 100644 --- a/torch/lib/c10d/example/allreduce.cpp +++ b/torch/lib/c10d/example/allreduce.cpp @@ -19,7 +19,7 @@ int main(int argc, char** argv) { } // Kick off work - std::vector> pending; + std::vector> pending; for (auto i = 0; i < ntensors; i++) { std::vector tmp = {tensors[i]}; pending.push_back(pg.allreduce(tmp)); diff --git a/torch/lib/c10d/frontend.hpp b/torch/lib/c10d/frontend.hpp index 69705427b53c..3449ee30b5ef 100644 --- a/torch/lib/c10d/frontend.hpp +++ b/torch/lib/c10d/frontend.hpp @@ -35,7 +35,7 @@ class DistributedC10d { const std::chrono::milliseconds& timeout, int64_t world_size, int64_t rank, - std::shared_ptr store, + c10::intrusive_ptr store, const std::string& group_name); void destroyProcessGroup(std::shared_ptr group); @@ -202,7 +202,7 @@ class DistributedC10d { // need to use ProcessGroup or ProcesGroup* as key. 
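// Sketch (illustrative only, not applied by this patch): giving Store (and
// ProcessGroup::Work) the torch::CustomClassHolder base is the prerequisite for exposing
// them as TorchBind custom classes, i.e. objects that can live inside an IValue and be
// passed through TorchScript. The class below is hypothetical; only the registration
// pattern is the point.
//
#include <string>
#include <torch/custom_class.h>

struct DemoKeyValue : torch::CustomClassHolder {
  void set(const std::string& key, const std::string& value) { key_ = key; value_ = value; }
  std::string get(const std::string& key) { return key == key_ ? value_ : std::string(); }
  std::string key_, value_;
};

// Static registration, following the TorchBind custom-class pattern.
static auto demoKeyValueClass =
    torch::class_<DemoKeyValue>("demo_classes", "DemoKeyValue")
        .def(torch::init<>())
        .def("set", &DemoKeyValue::set)
        .def("get", &DemoKeyValue::get);
// end of sketch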
std::unordered_map< std::shared_ptr, - std::pair>> + std::pair>> pg_map_; // Note, this is different mapping relationship than original Python diff --git a/torch/lib/c10d/reducer.cpp b/torch/lib/c10d/reducer.cpp index c05ce685bb7d..c5ee54a9ee8e 100644 --- a/torch/lib/c10d/reducer.cpp +++ b/torch/lib/c10d/reducer.cpp @@ -472,7 +472,7 @@ std::vector> Reducer::get_bucket_tensors() const { } void Reducer::set_forward_pass_work_handle( - std::shared_ptr forwardPassWorkHandle, + c10::intrusive_ptr forwardPassWorkHandle, bool useStaticWorldSize) { std::lock_guard lock(mutex_); forwardPassWorkHandle_.workHandle = std::move(forwardPassWorkHandle); diff --git a/torch/lib/c10d/reducer.hpp b/torch/lib/c10d/reducer.hpp index 4874f0dd8703..e0fe0004f88e 100644 --- a/torch/lib/c10d/reducer.hpp +++ b/torch/lib/c10d/reducer.hpp @@ -8,6 +8,7 @@ #include #include +#include #include #include #include @@ -96,7 +97,7 @@ class Reducer { // Creates and sets ForwardPassWorkHandle given a ProcessGroup::Work and the // corresponding tensor being reduced. void set_forward_pass_work_handle( - std::shared_ptr forwardPassWorkHandle, + c10::intrusive_ptr forwardPassWorkHandle, bool useStaticWorldSize); // Retrieve on-device tensors used to track locally unused parameters. For @@ -158,7 +159,7 @@ class Reducer { bool local_used_maps_reduced_; // Work handle for allreduce on local_used_maps_ - std::shared_ptr local_used_work_; + c10::intrusive_ptr local_used_work_; void verify_replicas_within_process(); @@ -282,7 +283,7 @@ class Reducer { size_t pending; // Keep work handle around when this set of buckets is being reduced. - std::shared_ptr work; + c10::intrusive_ptr work; // Keep future work handle around if DDP comm hook is registered. c10::intrusive_ptr future_work; @@ -340,7 +341,7 @@ class Reducer { // A struct containing work handle and tensor for allreduce scheduled in // forward pass, if applicable. struct ForwardPassAllreduceWork { - std::shared_ptr workHandle; + c10::intrusive_ptr workHandle; at::Tensor resultTensor; // whether we should divide by the initial world_size or the no. of // remaining DDP ranks. 
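// Sketch (illustrative only, not applied by this patch): a default-constructed
// c10::intrusive_ptr is empty, so structs such as the reducer's ForwardPassAllreduceWork
// can hold an optional work handle without an extra flag. TrackedWork is a hypothetical
// stand-in for the Work type.
//
#include <c10/util/intrusive_ptr.h>

struct TrackedWork : c10::intrusive_ptr_target {};

struct ForwardPassHandle {
  c10::intrusive_ptr<TrackedWork> workHandle;  // empty until an allreduce is scheduled
};

bool has_pending_work(const ForwardPassHandle& h) {
  return h.workHandle.get() != nullptr;  // get() returns nullptr for an empty handle
}
// end of sketch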
diff --git a/torch/lib/c10d/test/FileStoreTest.cpp b/torch/lib/c10d/test/FileStoreTest.cpp index cc8da6326091..ce75c78adce7 100644 --- a/torch/lib/c10d/test/FileStoreTest.cpp +++ b/torch/lib/c10d/test/FileStoreTest.cpp @@ -41,7 +41,7 @@ std::string tmppath() { void testGetSet(std::string path, std::string prefix = "") { // Basic Set/Get on File Store { - auto fileStore = std::make_shared(path, 2); + auto fileStore = c10::make_intrusive(path, 2); c10d::PrefixStore store(prefix, fileStore); c10d::test::set(store, "key0", "value0"); c10d::test::set(store, "key1", "value1"); @@ -53,7 +53,7 @@ void testGetSet(std::string path, std::string prefix = "") { // Perform get on new instance { - auto fileStore = std::make_shared(path, 2); + auto fileStore = c10::make_intrusive(path, 2); c10d::PrefixStore store(prefix, fileStore); c10d::test::check(store, "key0", "value0"); } @@ -69,7 +69,8 @@ void stressTestStore(std::string path, std::string prefix = "") { for (auto i = 0; i < numThreads; i++) { threads.push_back(std::thread([&] { - auto fileStore = std::make_shared(path, numThreads + 1); + auto fileStore = + c10::make_intrusive(path, numThreads + 1); c10d::PrefixStore store(prefix, fileStore); sem1.post(); sem2.wait(); @@ -87,7 +88,7 @@ void stressTestStore(std::string path, std::string prefix = "") { // Check that the counter has the expected value { - auto fileStore = std::make_shared(path, numThreads + 1); + auto fileStore = c10::make_intrusive(path, numThreads + 1); c10d::PrefixStore store(prefix, fileStore); std::string expected = std::to_string(numThreads * numIterations); c10d::test::check(store, "counter", expected); diff --git a/torch/lib/c10d/test/HashStoreTest.cpp b/torch/lib/c10d/test/HashStoreTest.cpp index a16f83231a58..24b7fc76a417 100644 --- a/torch/lib/c10d/test/HashStoreTest.cpp +++ b/torch/lib/c10d/test/HashStoreTest.cpp @@ -11,7 +11,7 @@ void testGetSet(std::string prefix = "") { // Basic set/get { - auto hashStore = std::make_shared(); + auto hashStore = c10::make_intrusive(); c10d::PrefixStore store(prefix, hashStore); c10d::test::set(store, "key0", "value0"); c10d::test::set(store, "key1", "value1"); @@ -32,7 +32,7 @@ void testGetSet(std::string prefix = "") { // get() waits up to timeout_. 
{ - auto hashStore = std::make_shared(); + auto hashStore = c10::make_intrusive(); c10d::PrefixStore store(prefix, hashStore); std::thread th([&]() { c10d::test::set(store, "key0", "value0"); }); c10d::test::check(store, "key0", "value0"); @@ -47,7 +47,7 @@ void stressTestStore(std::string prefix = "") { std::vector threads; c10d::test::Semaphore sem1, sem2; - auto hashStore = std::make_shared(); + auto hashStore = c10::make_intrusive(); c10d::PrefixStore store(prefix, hashStore); for (auto i = 0; i < numThreads; i++) { diff --git a/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp index 92dede9a573e..091ea9b2ad07 100644 --- a/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupGlooAsyncTest.cpp @@ -45,7 +45,7 @@ class AsyncTest { } void start(int rank, int size) { - auto store = std::make_shared<::c10d::FileStore>(path_, size); + auto store = c10::make_intrusive<::c10d::FileStore>(path_, size); // Use tiny timeout to make this test run fast ::c10d::ProcessGroupGloo::Options options; @@ -93,7 +93,7 @@ class AsyncInputIsOutputTest : public AsyncTest { } } - void wait(std::shared_ptr& work) { + void wait(c10::intrusive_ptr& work) { at::cuda::CUDAMultiStreamGuard guard(streams_); work->wait(); } @@ -130,7 +130,7 @@ class AsyncAllreduceTest : public AsyncInputIsOutputTest { AsyncAllreduceTest(const std::string& path, int numTensors) : AsyncInputIsOutputTest(path, numTensors) {} - std::shared_ptr run() { + c10::intrusive_ptr run() { // For the duration of this function, make THC use our streams at::cuda::CUDAMultiStreamGuard guard(streams_); @@ -156,7 +156,7 @@ class AsyncBroadcastTest : public AsyncInputIsOutputTest { AsyncBroadcastTest(const std::string& path, int numTensors) : AsyncInputIsOutputTest(path, numTensors) {} - std::shared_ptr run(int rootRank, int rootTensor) { + c10::intrusive_ptr run(int rootRank, int rootTensor) { // For the duration of this function, make THC use our streams at::cuda::CUDAMultiStreamGuard guard(streams_); @@ -185,7 +185,7 @@ void runAsyncAllreduceTest( size_t numProcesses = 4, size_t numTensors = 2) { auto tests = initialize(path, numProcesses, numTensors); - std::vector> work(numProcesses); + std::vector> work(numProcesses); for (size_t i = 0; i < numProcesses; i++) { work[i] = tests[i].run(); } @@ -229,7 +229,7 @@ void runAsyncBroadcastTest( // Try every permutation of root rank and root tensor for (size_t rootRank = 0; rootRank < numProcesses; rootRank++) { for (size_t rootTensor = 0; rootTensor < numTensors; rootTensor++) { - std::vector> work(numProcesses); + std::vector> work(numProcesses); for (size_t i = 0; i < numProcesses; i++) { work[i] = tests[i].run(rootRank, rootTensor); } diff --git a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp index da4f9b5fc106..469cf32a8442 100644 --- a/torch/lib/c10d/test/ProcessGroupGlooTest.cpp +++ b/torch/lib/c10d/test/ProcessGroupGlooTest.cpp @@ -44,8 +44,8 @@ class SignalTest { }); } - std::shared_ptr<::c10d::ProcessGroup::Work> run(int rank, int size) { - auto store = std::make_shared<::c10d::FileStore>(path_, size); + c10::intrusive_ptr<::c10d::ProcessGroup::Work> run(int rank, int size) { + auto store = c10::make_intrusive<::c10d::FileStore>(path_, size); ::c10d::ProcessGroupGloo::Options options; // Set a timeout that is small enough to make this test run fast, but also @@ -62,7 +62,7 @@ class SignalTest { }; // Loop until an exception happens - 
std::shared_ptr<::c10d::ProcessGroup::Work> work; + c10::intrusive_ptr<::c10d::ProcessGroup::Work> work; while (true) { work = pg.allreduce(tensors); try { @@ -82,7 +82,7 @@ class SignalTest { Semaphore sem_; }; -std::shared_ptr<::c10d::ProcessGroup::Work> testSignal( +c10::intrusive_ptr<::c10d::ProcessGroup::Work> testSignal( const std::string& path, int signal) { Fork fork; @@ -101,13 +101,13 @@ std::shared_ptr<::c10d::ProcessGroup::Work> testSignal( class ProcessGroupGlooDelayed : public ::c10d::ProcessGroupGloo { public: ProcessGroupGlooDelayed( - const std::shared_ptr<::c10d::Store>& store, + const c10::intrusive_ptr<::c10d::Store>& store, int rank, int size, Options options) : ProcessGroupGloo(store, rank, size, options) {} - std::shared_ptr<::c10d::ProcessGroup::Work> send( + c10::intrusive_ptr<::c10d::ProcessGroup::Work> send( std::vector& tensors, int dstRank, int tag) override { @@ -151,7 +151,7 @@ class CollectiveTest { } void start(int rank, int size, bool delayed) { - auto store = std::make_shared<::c10d::FileStore>(path_, size); + auto store = c10::make_intrusive<::c10d::FileStore>(path_, size); // Set a timeout that is small enough to make this test run fast, but also // make sure that we don't get timeouts in the ProcessGroupGloo constructor. @@ -200,7 +200,7 @@ void testAllreduce(const std::string& path, const at::DeviceType b) { } // Kick off work - std::vector> work(size); + std::vector> work(size); for (auto i = 0; i < size; i++) { work[i] = tests[i].getProcessGroup().allreduce(inputs[i]); } @@ -250,7 +250,7 @@ void testBroadcast(const std::string& path, const at::DeviceType b) { options.rootTensor = j; // Kick off work - std::vector> work(size); + std::vector> work(size); for (auto i = 0; i < size; i++) { work[i] = tests[i].getProcessGroup().broadcast(inputs[i], options); } @@ -316,7 +316,7 @@ void testAlltoall(const std::string& path, const at::DeviceType b) { }; // Kick off work - std::vector> work(size); + std::vector> work(size); for (auto rank = 0; rank < size; rank++) { work[rank] = tests[rank].getProcessGroup().alltoall_base( outputs[rank], inputs[rank], outputSplits[rank], inputSplits[rank]); @@ -349,7 +349,7 @@ void testBarrier(const std::string& path) { auto tests = CollectiveTest::initialize(path, size); // Kick off work - std::vector> work(size); + std::vector> work(size); for (auto i = 0; i < size; i++) { work[i] = tests[i].getProcessGroup().barrier(); } diff --git a/torch/lib/c10d/test/ProcessGroupMPITest.cpp b/torch/lib/c10d/test/ProcessGroupMPITest.cpp index 3f5a9e4cf331..6c60b3d6742d 100644 --- a/torch/lib/c10d/test/ProcessGroupMPITest.cpp +++ b/torch/lib/c10d/test/ProcessGroupMPITest.cpp @@ -14,7 +14,7 @@ // Wait for work to complete void waitWork( std::shared_ptr pg, - std::vector> works) { + std::vector> works) { for (auto& work : works) { try { work->wait(); @@ -34,10 +34,11 @@ void testAllreduce(int iter = 1000) { allTensors[i] = std::vector({tensor}); } - std::vector> works; + std::vector> works; for (auto& tensors : allTensors) { // Kick off work - std::shared_ptr<::c10d::ProcessGroup::Work> work = pg->allreduce(tensors); + c10::intrusive_ptr<::c10d::ProcessGroup::Work> work = + pg->allreduce(tensors); works.push_back(std::move(work)); } @@ -73,10 +74,11 @@ void testBroadcast(int iter = 10000) { } } - std::vector> works; + std::vector> works; for (auto& tensors : allTensors) { // Kick off work - std::shared_ptr<::c10d::ProcessGroup::Work> work = pg->broadcast(tensors); + c10::intrusive_ptr<::c10d::ProcessGroup::Work> work = + 
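// Sketch (illustrative only, not applied by this patch): after this change the tests
// stand up a Gloo process group roughly as follows; the timeout and device values are
// illustrative.
//
#include <chrono>
#include <string>
#include <vector>
#include <ATen/ATen.h>
#include <c10d/FileStore.hpp>
#include <c10d/ProcessGroupGloo.hpp>

c10::intrusive_ptr<c10d::ProcessGroup::Work> run_allreduce(
    const std::string& path, int rank, int size, std::vector<at::Tensor>& tensors) {
  auto store = c10::make_intrusive<c10d::FileStore>(path, size);

  c10d::ProcessGroupGloo::Options options;
  options.timeout = std::chrono::milliseconds(1000);
  options.devices.push_back(c10d::ProcessGroupGloo::createDefaultDevice());

  c10d::ProcessGroupGloo pg(store, rank, size, options);
  auto work = pg.allreduce(tensors);  // c10::intrusive_ptr<ProcessGroup::Work>
  work->wait();
  return work;
}
// end of sketch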
+        pg->broadcast(tensors);
     works.push_back(std::move(work));
   }
 
@@ -104,10 +106,10 @@ void testReduce(int iter = 10000) {
     allTensors[i] = std::vector<at::Tensor>({tensor});
   }
 
-  std::vector<std::shared_ptr<::c10d::ProcessGroup::Work>> works;
+  std::vector<c10::intrusive_ptr<::c10d::ProcessGroup::Work>> works;
   for (auto& tensors : allTensors) {
     // Kick off work
-    std::shared_ptr<::c10d::ProcessGroup::Work> work = pg->reduce(tensors);
+    c10::intrusive_ptr<::c10d::ProcessGroup::Work> work = pg->reduce(tensors);
     works.push_back(std::move(work));
   }
 
@@ -150,10 +152,10 @@ void testAllgather(int iter = 10000) {
     }
   }
 
-  std::vector<std::shared_ptr<::c10d::ProcessGroup::Work>> works;
+  std::vector<c10::intrusive_ptr<::c10d::ProcessGroup::Work>> works;
   for (size_t i = 0; i < allTensors.size(); ++i) {
     // Kick off work
-    std::shared_ptr<::c10d::ProcessGroup::Work> work =
+    c10::intrusive_ptr<::c10d::ProcessGroup::Work> work =
         pg->allgather(allOutputTensors[i], allTensors[i]);
     works.push_back(std::move(work));
   }
@@ -198,10 +200,10 @@ void testGather(int iter = 10000) {
     }
   }
 
-  std::vector<std::shared_ptr<::c10d::ProcessGroup::Work>> works;
+  std::vector<c10::intrusive_ptr<::c10d::ProcessGroup::Work>> works;
   for (size_t i = 0; i < allTensors.size(); ++i) {
     // Kick off work
-    std::shared_ptr<::c10d::ProcessGroup::Work> work =
+    c10::intrusive_ptr<::c10d::ProcessGroup::Work> work =
         pg->gather(allOutputTensors[i], allTensors[i]);
     works.push_back(std::move(work));
   }
@@ -249,10 +251,10 @@ void testScatter(int iter = 1) {
     }
   }
 
-  std::vector<std::shared_ptr<::c10d::ProcessGroup::Work>> works;
+  std::vector<c10::intrusive_ptr<::c10d::ProcessGroup::Work>> works;
   for (size_t i = 0; i < allTensors.size(); ++i) {
     // Kick off work
-    std::shared_ptr<::c10d::ProcessGroup::Work> work =
+    c10::intrusive_ptr<::c10d::ProcessGroup::Work> work =
         pg->scatter(allTensors[i], allInputTensors[i]);
     works.push_back(std::move(work));
   }
@@ -289,27 +291,27 @@ void testSendRecv(bool recvAnysource, int iter = 10000) {
   }
 
   if (rank == 0) {
-    std::vector<std::shared_ptr<::c10d::ProcessGroup::Work>> works;
+    std::vector<c10::intrusive_ptr<::c10d::ProcessGroup::Work>> works;
     for (auto& tensors : allTensors) {
       // Kick off work
-      std::shared_ptr<::c10d::ProcessGroup::Work> work =
+      c10::intrusive_ptr<::c10d::ProcessGroup::Work> work =
           pg->send(tensors, 1, 0);
       works.push_back(std::move(work));
     }
     waitWork(pg, works);
   }
   if (rank == 1) {
-    std::vector<std::shared_ptr<::c10d::ProcessGroup::Work>> works;
+    std::vector<c10::intrusive_ptr<::c10d::ProcessGroup::Work>> works;
     std::vector<int> srcRanks(allTensors.size(), -1);
     size_t i = 0;
     for (auto& tensors : allTensors) {
       // Kick off work
       if (!recvAnysource) {
-        std::shared_ptr<::c10d::ProcessGroup::Work> work =
+        c10::intrusive_ptr<::c10d::ProcessGroup::Work> work =
            pg->recv(tensors, 0, 0);
        works.push_back(std::move(work));
      } else {
-        std::shared_ptr<::c10d::ProcessGroup::Work> work =
+        c10::intrusive_ptr<::c10d::ProcessGroup::Work> work =
            pg->recvAnysource(tensors, 0);
        works.push_back(std::move(work));
      }
diff --git a/torch/lib/c10d/test/ProcessGroupNCCLErrorsTest.cpp b/torch/lib/c10d/test/ProcessGroupNCCLErrorsTest.cpp
index e906702a889d..e19981c523de 100644
--- a/torch/lib/c10d/test/ProcessGroupNCCLErrorsTest.cpp
+++ b/torch/lib/c10d/test/ProcessGroupNCCLErrorsTest.cpp
@@ -37,7 +37,7 @@ class WorkNCCLSimulateErrors : public c10d::ProcessGroupNCCL::WorkNCCL {
 class ProcessGroupNCCLSimulateErrors : public c10d::ProcessGroupNCCL {
  public:
  ProcessGroupNCCLSimulateErrors(
-      const std::shared_ptr<c10d::Store>& store,
+      const c10::intrusive_ptr<c10d::Store>& store,
      int rank,
      int size,
      c10d::ProcessGroupNCCL::Options opts)
@@ -56,12 +56,12 @@ class ProcessGroupNCCLSimulateErrors : public c10d::ProcessGroupNCCL {
        ProcessGroupNCCLSimulateErrors::kWatchdogThreadSleepMillis);
  }
 
-  std::shared_ptr<c10d::ProcessGroupNCCL::WorkNCCL> initWork(
+  c10::intrusive_ptr<c10d::ProcessGroupNCCL::WorkNCCL> initWork(
      std::vector<at::Device> devices,
      int rank,
      c10d::OpType opType,
      const char* profilingTitle) override {
-    return std::make_shared<WorkNCCLSimulateErrors>(
+    return c10::make_intrusive<WorkNCCLSimulateErrors>(
        devices, simulate_error_, rank, opType);
  }
 
@@ -106,19 +106,19 @@ class WorkNCCLTimedoutErrors : public c10d::ProcessGroupNCCL::WorkNCCL {
 class ProcessGroupNCCLTimedOutErrors : public ProcessGroupNCCLSimulateErrors {
  public:
  ProcessGroupNCCLTimedOutErrors(
-      const std::shared_ptr<c10d::Store>& store,
+      const c10::intrusive_ptr<c10d::Store>& store,
      int rank,
      int size,
      c10d::ProcessGroupNCCL::Options opts)
      : ProcessGroupNCCLSimulateErrors(store, rank, size, opts),
        set_timedout_error_(false) {}
 
-  std::shared_ptr<c10d::ProcessGroupNCCL::WorkNCCL> initWork(
+  c10::intrusive_ptr<c10d::ProcessGroupNCCL::WorkNCCL> initWork(
      std::vector<at::Device> devices,
      int rank,
      c10d::OpType opType,
      const char* profilingTitle) override {
-    return std::make_shared<WorkNCCLTimedoutErrors>(
+    return c10::make_intrusive<WorkNCCLTimedoutErrors>(
        devices, set_timedout_error_, rank, opType);
  }
 
@@ -153,7 +153,7 @@ class ProcessGroupNCCLErrorsTest : public ::testing::Test {
   void SetUp() override {
     size_t numDevices = cudaNumDevices();
     TemporaryFile file;
-    store_ = std::make_shared<::c10d::FileStore>(file.path, 1);
+    store_ = c10::make_intrusive<::c10d::FileStore>(file.path, 1);
 
     at::cuda::OptionalCUDAGuard deviceGuard;
     tensors_.resize(numDevices);
@@ -168,7 +168,7 @@ class ProcessGroupNCCLErrorsTest : public ::testing::Test {
   }
 
   std::vector<at::Tensor> tensors_;
-  std::shared_ptr<::c10d::FileStore> store_;
+  c10::intrusive_ptr<::c10d::FileStore> store_;
 };
 
 TEST_F(ProcessGroupNCCLErrorsTest, testNCCLErrorsBlocking) {
diff --git a/torch/lib/c10d/test/ProcessGroupNCCLTest.cpp b/torch/lib/c10d/test/ProcessGroupNCCLTest.cpp
index 92b477fae7de..fa5e988273fc 100644
--- a/torch/lib/c10d/test/ProcessGroupNCCLTest.cpp
+++ b/torch/lib/c10d/test/ProcessGroupNCCLTest.cpp
@@ -31,7 +31,7 @@ class NCCLTestBase {
   }
 
   void initialize(int rank, int size) {
-    auto store = std::make_shared<::c10d::FileStore>(path_, size);
+    auto store = c10::make_intrusive<::c10d::FileStore>(path_, size);
 
     pg_ = std::unique_ptr<::c10d::ProcessGroupNCCL>(
         new ::c10d::ProcessGroupNCCL(store, rank, size));
@@ -80,7 +80,7 @@ class NCCLTest : public NCCLTestBase {
   }
 
   void wait(
-      std::shared_ptr<c10d::ProcessGroup::Work>& work,
+      c10::intrusive_ptr<c10d::ProcessGroup::Work>& work,
      std::chrono::milliseconds timeout = kNoTimeout) {
    at::cuda::CUDAMultiStreamGuard guard(streams_);
    work->wait(timeout);
@@ -166,7 +166,7 @@ class AllreduceNCCLTest : public NCCLTest {
   AllreduceNCCLTest(const std::string& path, int worldSize)
       : NCCLTest(path, worldSize) {}
 
-  std::shared_ptr<c10d::ProcessGroup::Work> run() {
+  c10::intrusive_ptr<c10d::ProcessGroup::Work> run() {
    // For the duration of this function, make THC use our streams
    at::cuda::CUDAMultiStreamGuard guard(streams_);
 
@@ -189,7 +189,7 @@ class BroadcastNCCLTest : public NCCLTest {
   BroadcastNCCLTest(const std::string& path, int worldSize)
       : NCCLTest(path, worldSize) {}
 
-  std::shared_ptr<c10d::ProcessGroup::Work> run(int rootRank, int rootTensor) {
+  c10::intrusive_ptr<c10d::ProcessGroup::Work> run(int rootRank, int rootTensor) {
    // For the duration of this function, make THC use our streams
    at::cuda::CUDAMultiStreamGuard guard(streams_);
 
@@ -208,7 +208,7 @@ class ReduceNCCLTest : public NCCLTest {
   ReduceNCCLTest(const std::string& path, int worldSize)
       : NCCLTest(path, worldSize) {}
 
-  std::shared_ptr<c10d::ProcessGroup::Work> run(int rootRank, int rootTensor) {
+  c10::intrusive_ptr<c10d::ProcessGroup::Work> run(int rootRank, int rootTensor) {
    // For the duration of this function, make THC use our streams
    at::cuda::CUDAMultiStreamGuard guard(streams_);
 
@@ -227,7 +227,7 @@ class AllgatherNCCLTest : public NCCLTest {
   AllgatherNCCLTest(const std::string& path, int worldSize)
       : NCCLTest(path, worldSize) {}
 
-  std::shared_ptr<c10d::ProcessGroup::Work> run() {
+  c10::intrusive_ptr<c10d::ProcessGroup::Work> run() {
    // For the duration of this function, make THC use our streams
    at::cuda::CUDAMultiStreamGuard guard(streams_);
 
@@ -242,7 +242,7 @@ struct ReduceScatterNCCLTest : NCCLTest {
   ReduceScatterNCCLTest(const std::string& path, int worldSize)
       : NCCLTest(path, worldSize) {}
 
-  std::shared_ptr<c10d::ProcessGroup::Work> run() {
+  c10::intrusive_ptr<c10d::ProcessGroup::Work> run() {
    // For the duration of this function, make THC use our streams
    at::cuda::CUDAMultiStreamGuard guard(streams_);
diff --git a/torch/lib/c10d/test/TCPStoreTest.cpp b/torch/lib/c10d/test/TCPStoreTest.cpp
index 0cfa72c7801a..8073ec0345e0 100644
--- a/torch/lib/c10d/test/TCPStoreTest.cpp
+++ b/torch/lib/c10d/test/TCPStoreTest.cpp
@@ -16,7 +16,7 @@ void testHelper(const std::string& prefix = "") {
   const auto numThreads = 16;
   const auto numWorkers = numThreads + 1;
 
-  auto serverTCPStore = std::make_shared<c10d::TCPStore>(
+  auto serverTCPStore = c10::make_intrusive<c10d::TCPStore>(
      "127.0.0.1",
      0,
      numWorkers,
@@ -25,7 +25,7 @@ void testHelper(const std::string& prefix = "") {
      /* wait */ false);
 
  auto serverStore =
-      std::make_unique<c10d::PrefixStore>(prefix, serverTCPStore);
+      c10::make_intrusive<c10d::PrefixStore>(prefix, serverTCPStore);
  // server store
  auto serverThread = std::thread([&serverStore, &serverTCPStore] {
    // Wait for all workers to join.
@@ -64,13 +64,13 @@ void testHelper(const std::string& prefix = "") {
  c10d::test::Semaphore sem1, sem2;
 
  // Each thread will have a client store to send/recv data
-  std::vector<std::unique_ptr<c10d::TCPStore>> clientTCPStores;
-  std::vector<std::unique_ptr<c10d::PrefixStore>> clientStores;
+  std::vector<c10::intrusive_ptr<c10d::TCPStore>> clientTCPStores;
+  std::vector<c10::intrusive_ptr<c10d::PrefixStore>> clientStores;
  for (auto i = 0; i < numThreads; i++) {
-    clientTCPStores.push_back(std::make_unique<c10d::TCPStore>(
+    clientTCPStores.push_back(c10::make_intrusive<c10d::TCPStore>(
        "127.0.0.1", serverTCPStore->getPort(), numWorkers, false));
-    clientStores.push_back(std::unique_ptr<c10d::PrefixStore>(
-        new c10d::PrefixStore(prefix, clientTCPStores[i])));
+    clientStores.push_back(
+        c10::make_intrusive<c10d::PrefixStore>(prefix, clientTCPStores[i]));
  }
  std::string expectedCounterRes =
      std::to_string(numThreads * numIterations + 1);
diff --git a/torch/onnx/symbolic_opset9.py b/torch/onnx/symbolic_opset9.py
index b917300c624e..d06203cb4508 100644
--- a/torch/onnx/symbolic_opset9.py
+++ b/torch/onnx/symbolic_opset9.py
@@ -209,6 +209,44 @@ def matmul(g, self, other):
 
 @parse_args('v', 'v', 'v', 't', 't')
 def addmm(g, self, mat1, mat2, beta, alpha):
+    dtype = None
+    self_dtype = self.type().scalarType()
+    mat1_dtype = mat1.type().scalarType()
+    mat2_dtype = mat2.type().scalarType()
+    if self_dtype is not None:
+        dtype = self_dtype
+    elif mat1_dtype is not None:
+        dtype = mat1_dtype
+    elif mat2_dtype is not None:
+        dtype = mat2_dtype
+
+    mat1_rank = mat1.type().dim()
+    mat2_rank = mat2.type().dim()
+
+    def isNotNoneAnd(v, u):
+        return v is not None and v != u
+
+    if dtype is not None and (isNotNoneAnd(mat1_rank, 2) or isNotNoneAnd(mat2_rank, 2)):
+        dtype = sym_help.scalar_type_to_onnx.index(sym_help.cast_pytorch_to_onnx[dtype])
+        dtype = sym_help.scalar_type_to_pytorch_type[dtype]
+
+        res1 = g.op("MatMul", mat1, mat2)
+        res2 = self
+
+        alpha = sym_help._scalar(alpha)
+        beta = sym_help._scalar(beta)
+
+        if alpha != 1:
+            alpha = g.op("Constant",
+                         value_t=torch.tensor(alpha, dtype=dtype))
+            res1 = g.op("Mul", res1, alpha)
+        if beta != 1:
+            beta = g.op("Constant",
+                        value_t=torch.tensor(sym_help._scalar(beta), dtype=dtype))
+            res2 = g.op("Mul", res2, beta)
+
+        return g.op("Add", res1, res2)
+
     return g.op("Gemm", mat1, mat2, self, beta_f=sym_help._scalar(beta), alpha_f=sym_help._scalar(alpha))
 
@@ -1110,7 +1148,8 @@ def log_softmax(g, input, dim, dtype=None):
         dim = input_dim - 1
     return_op = g.op("LogSoftmax", input, axis_i=dim)
     if dtype and dtype.node().kind() != 'prim::Constant':
-        return_op = g.op("Cast", return_op, to_i=sym_help.scalar_type_to_onnx[dtype])
+        parsed_dtype = sym_help._get_const(dtype, 'i', 'dtype')
+        return_op = g.op("Cast", return_op, to_i=sym_help.scalar_type_to_onnx[parsed_dtype])
     if is_transpose_required:
         return_op = g.op("Transpose", return_op, perm_i=axes)
     return return_op
@@ -1645,10 +1684,22 @@ def new_full(g, self, size, fill_value, dtype, layout, device, pin_memory=False)
     return full(g, size, fill_value, dtype, layout, device, pin_memory)
 
 
-def eye(g, n, m, dtype=None, layout=None, device=None, pin_memory=False):
-    shape = g.op("Concat", g.op("Unsqueeze", n, axes_i=[0]), g.op("Unsqueeze", m, axes_i=[0]), axis_i=0)
-    tensor = zeros(g, shape, dtype, layout, device)
-    return g.op("EyeLike", tensor)
+def eye(g, *args):
+    if len(args) == 5:
+        # aten::eye(n, dtype, layout, device, pin_memory)
+        n, dtype, layout, device, pin_memory = args
+        dim_size = g.op("Unsqueeze", n, axes_i=[0])
+        shape = g.op("Concat", dim_size, dim_size, axis_i=0)
+        tensor = zeros(g, shape, dtype, layout, device)
+        return g.op("EyeLike", tensor)
+    elif len(args) == 6:
+        # aten::eye(n, m, dtype, layout, device, pin_memory)
+        n, m, dtype, layout, device, pin_memory = args
+        shape = g.op("Concat", g.op("Unsqueeze", n, axes_i=[0]), g.op("Unsqueeze", m, axes_i=[0]), axis_i=0)
+        tensor = zeros(g, shape, dtype, layout, device)
+        return g.op("EyeLike", tensor)
+    else:
+        raise NotImplementedError("Unknown aten::eye signature")
 
 
 def slice(g, self, *args):
diff --git a/torch/quantization/fx/quantize.py b/torch/quantization/fx/quantize.py
index 35e3e1ac8efb..b87612c97dbe 100644
--- a/torch/quantization/fx/quantize.py
+++ b/torch/quantization/fx/quantize.py
@@ -347,9 +347,10 @@ def _prepare(self, model, qconfig_dict, prepare_custom_config_dict, is_standalon
         # match the patterns that will get quantized
         standalone_module_names = prepare_custom_config_dict.get("standalone_module_name", None)
+        standalone_module_classes = prepare_custom_config_dict.get("standalone_module_class", None)
         custom_module_classes = get_custom_module_class_keys(
             prepare_custom_config_dict, "float_to_observed_custom_module_class")
         matches = self._find_matches(
-            model.graph, self.modules, self.patterns, standalone_module_names, custom_module_classes)
+            model.graph, self.modules, self.patterns, standalone_module_names, standalone_module_classes, custom_module_classes)
 
         # find _inputs_ to matched nodes that are not quantized, these
         # have to be quantized, which requires measuring stats,
@@ -826,7 +827,9 @@ def convert(self, model, debug=False, convert_custom_config_dict=None, is_standa
     def _find_matches(
             self, graph, modules, patterns,
-            standalone_module_names=None, custom_module_classes=None):
+            standalone_module_names=None,
+            standalone_module_classes=None,
+            custom_module_classes=None):
         """
         Matches the nodes in the input graph to quantization patterns, and
         outputs the information needed to quantize them in future steps.
@@ -850,6 +853,12 @@ def _find_matches(
         if custom_module_classes is None:
             custom_module_classes = []
 
+        if standalone_module_classes is None:
+            standalone_module_classes = []
+
+        if standalone_module_names is None:
+            standalone_module_names = []
+
         match_map = {}
         all_matched = set()
 
@@ -883,10 +892,9 @@ def record_match(pattern, node, matched):
                 match_map[node.name] = (
                     node, [node], None, CustomModuleQuantizeHandler(self, node),
                     custom_module_qconfig)
-        def is_standalone_module(module_path):
-            if standalone_module_names is None:
-                return False
-            return module_path in standalone_module_names
+        def is_standalone_module(node_target):
+            return node_target in standalone_module_names or \
+                type(self.modules[node_target]) in standalone_module_classes
 
         # add standalone modules to the match
         for node in graph.nodes:
diff --git a/torch/quantization/observer.py b/torch/quantization/observer.py
index 4dac8fb68429..fbd8168393c8 100644
--- a/torch/quantization/observer.py
+++ b/torch/quantization/observer.py
@@ -258,9 +258,9 @@ def _calculate_qparams(self, min_val: torch.Tensor, max_val: torch.Tensor) -> Tu
         min_val_neg = torch.min(min_val, torch.zeros_like(min_val))
         max_val_pos = torch.max(max_val, torch.zeros_like(max_val))
 
-        scale = torch.ones(min_val_neg.size(), dtype=torch.float32)
-        zero_point = torch.zeros(min_val_neg.size(), dtype=torch.int64)
-        device = 'cuda' if min_val_neg.is_cuda else 'cpu'
+        device = min_val_neg.device
+        scale = torch.ones(min_val_neg.size(), dtype=torch.float32, device=device)
+        zero_point = torch.zeros(min_val_neg.size(), dtype=torch.int64, device=device)
 
         if self.qscheme == torch.per_tensor_symmetric or self.qscheme == torch.per_channel_symmetric:
             max_val_pos = torch.max(-min_val_neg, max_val_pos)
@@ -297,7 +297,6 @@ def _calculate_qparams(self, min_val: torch.Tensor, max_val: torch.Tensor) -> Tu
             if self.qscheme == torch.per_channel_affine_float_qparams:
                 zero_point = torch.tensor([float(zero_point)], dtype=zero_point.dtype, device=device)
 
-
         return scale, zero_point
diff --git a/torch/quantization/quantize_fx.py b/torch/quantization/quantize_fx.py
index 93043559bf48..91d58c2966a4 100644
--- a/torch/quantization/quantize_fx.py
+++ b/torch/quantization/quantize_fx.py
@@ -75,6 +75,9 @@ def _prepare_fx(model, qconfig_dict, prepare_custom_config_dict=None, is_standal
     # standalone module and custom module config are applied in top level module
     standalone_module_names = prepare_custom_config_dict.get('standalone_module_name', [])
     skipped_module_names += standalone_module_names
+
+    standalone_module_classes = prepare_custom_config_dict.get('standalone_module_class', [])
+    skipped_module_classes += standalone_module_classes
     float_custom_module_classes = get_custom_module_class_keys(
         prepare_custom_config_dict, "float_to_observed_custom_module_class")
     skipped_module_classes += float_custom_module_classes
@@ -170,6 +173,11 @@ def prepare_fx(model, qconfig_dict, prepare_custom_config_dict=None):
       "standalone_module_name": [
          "submodule.standalone"
       ],
+
+      "standalone_module_class": [
+          StandaloneModule
+      ],
+
       # user will manually define the corresponding observed
       # module class which has a from_float class method that converts
       # float custom module to observed custom module
diff --git a/torch/tensor.py b/torch/tensor.py
index 64e7d9ee44c0..b3cb2890fde9 100644
--- a/torch/tensor.py
+++ b/torch/tensor.py
@@ -1040,7 +1040,8 @@ def _convert(ret, cls):
     if isinstance(ret, Tensor):
         ret = ret.as_subclass(cls)
 
-    if isinstance(ret, tuple):
-        ret = tuple(_convert(r, cls) for r in ret)
+    if isinstance(ret, (tuple, list)):
+        # Also handles things like namedtuples
+        ret = type(ret)(_convert(r, cls) for r in ret)
 
     return ret
diff --git a/torch/testing/__init__.py b/torch/testing/__init__.py
index 80120b019a99..c26556f4d70a 100644
--- a/torch/testing/__init__.py
+++ b/torch/testing/__init__.py
@@ -212,7 +212,8 @@ def assert_allclose(actual, expected, rtol=None, atol=None, equal_nan=True, msg=
     if not isinstance(expected, torch.Tensor):
         expected = torch.tensor(expected, dtype=actual.dtype)
     if expected.shape != actual.shape:
-        expected = expected.expand_as(actual)
+        raise AssertionError("expected tensor shape {0} doesn't match with actual tensor "
+                             "shape {1}!".format(expected.shape, actual.shape))
     if rtol is None or atol is None:
         if rtol is not None or atol is not None:
             raise ValueError("rtol and atol must both be specified or both be unspecified")
diff --git a/torch/testing/_internal/common_methods_invocations.py b/torch/testing/_internal/common_methods_invocations.py
index c61f6e709afe..c409b5265a67 100644
--- a/torch/testing/_internal/common_methods_invocations.py
+++ b/torch/testing/_internal/common_methods_invocations.py
@@ -14,7 +14,7 @@
     floating_types, floating_types_and, floating_and_complex_types, floating_and_complex_types_and,
     all_types_and_complex_and, all_types_and)
 from torch.testing._internal.common_device_type import \
-    (skipCUDAIfNoMagma, skipCPUIfNoLapack, expectedFailureCUDA,
+    (skipCUDAIfNoMagma, skipCPUIfNoLapack,
     expectedAlertNondeterministic, precisionOverride)
 from torch.testing._internal.common_utils import \
     (prod_single_zero, random_square_matrix_of_rank,
@@ -867,11 +867,9 @@ def method_tests():
        ('kthvalue', (S, S, S), (2, 1, True,), 'keepdim_dim', (), [1]),
        ('kthvalue', (S,), (2, 0,), 'dim_1d', (), [1]),
        ('kthvalue', (S,), (2, 0, True,), 'keepdim_dim_1d', (), [1]),
-        # TODO: https://github.com/pytorch/pytorch/issues/30818
-        ('kthvalue', (), (1,), 'scalar', (), (), [expectedFailureCUDA]),
-        ('kthvalue', (), (1, 0,), 'scalar_dim', (), [1], [expectedFailureCUDA]),
-        ('kthvalue', (), (1, 0, True), 'scalar_keepdim_dim', (), [1], [expectedFailureCUDA]),
-        # END TODO
+        ('kthvalue', (), (1,), 'scalar', (), ()),
+        ('kthvalue', (), (1, 0,), 'scalar_dim', (), [1]),
+        ('kthvalue', (), (1, 0, True), 'scalar_keepdim_dim', (), [1]),
        ('quantile', (S, S, S), (0.5,)),
        ('quantile', (S, S, S), (0.5, 0), 'dim', (), [1]),
        ('quantile', (S, S, S), (0.5, None, True), 'keepdim'),
diff --git a/torch/testing/_internal/distributed/distributed_test.py b/torch/testing/_internal/distributed/distributed_test.py
index 9ce1c58cb4da..21fd1a2a88e4 100644
--- a/torch/testing/_internal/distributed/distributed_test.py
+++ b/torch/testing/_internal/distributed/distributed_test.py
@@ -4111,3 +4111,145 @@ def forward(_self, input, expected_type):  # noqa
             inp = TestNamedTupleInput_1(a, b)
             model(inp, type(inp))
+
+    @require_backend({"gloo", "nccl"})
+    @require_backends_available({"gloo", "nccl"})
+    @skip_if_lt_x_gpu(2)
+    @skip_if_rocm
+    def test_ddp_control_flow_same_across_ranks(self):
+        # Control flow that is the same across ranks.
+        batch = 20
+        dim = 10
+
+        class ToyModel(nn.Module):
+            def __init__(self):
+                super(ToyModel, self).__init__()
+                self.lin1 = nn.Linear(10, 10, bias=False)
+                self.lin2 = nn.Linear(10, 10, bias=False)
+
+            def forward(self, x):
+                # Second layer is used dependent on input x.
+                use_second_layer = torch.equal(
+                    x, torch.ones(batch, dim, device=x.device)
+                )
+                if use_second_layer:
+                    return self.lin2(F.relu(self.lin1(x)))
+                else:
+                    return F.relu(self.lin1(x))
+
+        world_size = dist.get_world_size()
+        torch.cuda.set_device(self.rank)
+        model = torch.nn.parallel.DistributedDataParallel(
+            ToyModel().cuda(self.rank),
+            device_ids=[self.rank],
+            find_unused_parameters=True,
+        )
+        random_input = torch.randn(batch, dim, device=self.rank)
+        ones_input = torch.ones(batch, dim, device=self.rank)
+        for i in range(6):
+            if i % 2 == 0:
+                out = model(random_input)
+            else:
+                out = model(ones_input)
+            loss = out.sum()
+            loss.backward()
+            # On even iterations, 2nd param goes unused, on odd iterations,
+            # it is used.
+            local_used_maps = model.reducer._get_local_used_maps()
+            if i % 2 == 0:
+                expected = torch.tensor([world_size, 0], device=self.rank, dtype=torch.int32)
+            else:
+                expected = torch.tensor([world_size, world_size], device=self.rank, dtype=torch.int32)
+
+            # Validate parameter usage.
+            variable_usage_tensor = local_used_maps[0]
+            self.assertEqual(variable_usage_tensor, expected)
+
+        # Validate appropriate error message when DDP is used with
+        # find_unused_parameters=False.
+        model = torch.nn.parallel.DistributedDataParallel(
+            ToyModel().cuda(self.rank),
+            device_ids=[self.rank],
+            find_unused_parameters=False,
+        )
+        for i in range(2):
+            with self.assertRaisesRegex(
+                RuntimeError,
+                "Expected to have finished reduction in the prior iteration before starting a new one",
+            ) if i == 1 else suppress():
+                loss = model(random_input).sum()
+                loss.backward()
+
+    @require_backend({"gloo", "nccl"})
+    @require_backends_available({"gloo", "nccl"})
+    @skip_if_lt_x_gpu(2)
+    @skip_if_rocm
+    def test_ddp_control_flow_different_across_ranks(self):
+        # Control flow that is different across ranks.
+        batch = 20
+        dim = 10
+
+        class ToyModel(nn.Module):
+            def __init__(self, rank):
+                super(ToyModel, self).__init__()
+                self.lin1 = nn.Linear(10, 10, bias=False)
+                self.lin2 = nn.Linear(10, 10, bias=False)
+                self.rank = rank
+
+            def forward(self, x):
+                # Control-flow that is rank and input dependent for the
+                # model.
+                use_second_layer = (
+                    torch.equal(x, torch.ones(batch, dim, device=x.device))
+                    and self.rank == 1
+                )
+
+                if use_second_layer:
+                    return self.lin2(F.relu(self.lin1(x)))
+                else:
+                    return F.relu(self.lin1(x))
+
+        world_size = dist.get_world_size()
+        torch.cuda.set_device(self.rank)
+        model = torch.nn.parallel.DistributedDataParallel(
+            ToyModel(self.rank).cuda(self.rank),
+            device_ids=[self.rank],
+            find_unused_parameters=True,
+        )
+        random_input = torch.randn(batch, dim, device=self.rank)
+        ones_input = torch.ones(batch, dim, device=self.rank)
+        for i in range(6):
+            if i % 2 == 0:
+                out = model(random_input)
+            else:
+                out = model(ones_input)
+            loss = out.sum()
+            loss.backward()
+            # On even iterations, 2nd param goes unused, on odd iterations,
+            # it is used only on rank 1.
+            local_used_maps = model.reducer._get_local_used_maps()
+
+            if i % 2 == 0:
+                expected = torch.tensor([world_size, 0], device=self.rank, dtype=torch.int32)
+            else:
+                expected = torch.tensor([world_size, 1], device=self.rank, dtype=torch.int32)
+
+            variable_usage_tensor = local_used_maps[0]
+            # Validate parameter usage. On odd iterations, 2nd param is only
+            # used on rank 1.
+            self.assertEqual(variable_usage_tensor, expected)
+
+        # Validate appropriate error message when DDP is used with
+        # find_unused_parameters=False.
+        model = torch.nn.parallel.DistributedDataParallel(
+            ToyModel(self.rank).cuda(self.rank),
+            device_ids=[self.rank],
+            find_unused_parameters=False,
+        )
+        for i in range(2):
+            with self.assertRaisesRegex(
+                RuntimeError,
+                "Expected to have finished reduction in the prior iteration before starting a new one",
+            ) if i == 1 else suppress():
+                loss = model(random_input).sum()
+                loss.backward()
diff --git a/torch/utils/cpp_extension.py b/torch/utils/cpp_extension.py
index d0ff0500063a..59657e49f427 100644
--- a/torch/utils/cpp_extension.py
+++ b/torch/utils/cpp_extension.py
@@ -562,7 +562,7 @@ def spawn(cmd):
         else:
             cflags = []
 
-        cflags = win_cuda_flags(cflags)
+        cflags = win_cuda_flags(cflags) + ['--use-local-env']
         for flag in COMMON_MSVC_FLAGS:
             cflags = ['-Xcompiler', flag] + cflags
         for ignore_warning in MSVC_IGNORE_CUDAFE_WARNINGS:
@@ -632,7 +632,7 @@ def win_wrap_ninja_compile(sources,
         cuda_post_cflags = None
         cuda_cflags = None
         if with_cuda:
-            cuda_cflags = []
+            cuda_cflags = ['--use-local-env']
             for common_cflag in common_cflags:
                 cuda_cflags.append('-Xcompiler')
                 cuda_cflags.append(common_cflag)
@@ -1429,9 +1429,17 @@ def _get_cuda_arch_flags(cflags: Optional[List[str]] = None) -> List[str]:
     # See cmake/Modules_CUDA_fix/upstream/FindCUDA/select_compute_arch.cmake
     _arch_list = os.environ.get('TORCH_CUDA_ARCH_LIST', None)
 
-    # If not given, determine what's needed for the GPU that can be found
+    # If not given, determine what's best for the GPU / CUDA version that can be found
     if not _arch_list:
         capability = torch.cuda.get_device_capability()
+        supported_sm = [int(arch.split('_')[1])
+                        for arch in torch.cuda.get_arch_list() if 'sm_' in arch]
+        max_supported_sm = max((sm // 10, sm % 10) for sm in supported_sm)
+        # Capability of the device may be higher than what's supported by the user's
+        # NVCC, causing compilation error. User's NVCC is expected to match the one
+        # used to build pytorch, so we use the maximum supported capability of pytorch
+        # to clamp the capability.
+        capability = min(max_supported_sm, capability)
         arch_list = [f'{capability[0]}.{capability[1]}']
     else:
         # Deal with lists that are ' ' separated (only deal with ';' after)
diff --git a/torch/utils/show_pickle.py b/torch/utils/show_pickle.py
index 0e2498d64c56..9e55ebff48b9 100644
--- a/torch/utils/show_pickle.py
+++ b/torch/utils/show_pickle.py
@@ -68,6 +68,7 @@ def persistent_load(self, pid):
     def dump(cls, in_stream, out_stream):
         value = cls(in_stream).load()
         pprint.pprint(value, stream=out_stream)
+        return value
 
 
 def main(argv, output_stream=None):
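For reference, the capability clamp added to `_get_cuda_arch_flags` in torch/utils/cpp_extension.py above can be read in isolation as the minimal sketch below. The `clamp_capability` name and the sample arch lists are hypothetical stand-ins for `torch.cuda.get_device_capability()` and `torch.cuda.get_arch_list()`; this is not code from the patch itself.

```python
# Illustrative sketch only (not part of the patch above): clamp a detected device
# capability to the newest SM architecture in a given arch list, mirroring the
# _get_cuda_arch_flags change.
def clamp_capability(capability, arch_list):
    # arch_list entries look like "sm_70", "sm_80", "compute_80", ...
    supported_sm = [int(arch.split('_')[1]) for arch in arch_list if 'sm_' in arch]
    max_supported_sm = max((sm // 10, sm % 10) for sm in supported_sm)
    # A device newer than anything the toolchain supports gets compiled for the
    # newest supported architecture instead of failing at nvcc time.
    return min(max_supported_sm, capability)

assert clamp_capability((8, 6), ["sm_70", "sm_75", "sm_80"]) == (8, 0)
assert clamp_capability((7, 5), ["sm_70", "sm_75", "sm_80"]) == (7, 5)
```

With an arch list that tops out at `sm_80`, an `(8, 6)` device is compiled for `8.0` rather than erroring out in nvcc, which is the intent of the change.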